From 4cae6b37037a97c8d4499af0993b2adc4e5a24de Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 22 May 2024 14:51:08 -0700 Subject: [PATCH] Migrate `serialize_v0` to new API. This is the middle layer of the API design work (#172). We add a manifest abstract class to represent various manifests (#111 #112) and also ways to serialize a model directory into manifests and ways to verify the manifests. For now, this only does what was formerly known as `serialize_v0`. The v1 and the manifest versions will come soon. Note: This has a lot of inspiration from #112, but makes the API work with all the use cases we need to consider right now. Signed-off-by: Mihai Maruseac --- model_signing/manifest/__init__.py | 13 ++ model_signing/manifest/manifest.py | 39 ++++ model_signing/serializing/__init__.py | 13 ++ model_signing/serializing/dfs.py | 105 +++++++++ model_signing/serializing/dfs_test.py | 281 +++++++++++++++++++++++ model_signing/serializing/serializing.py | 33 +++ 6 files changed, 484 insertions(+) create mode 100644 model_signing/manifest/__init__.py create mode 100644 model_signing/manifest/manifest.py create mode 100644 model_signing/serializing/__init__.py create mode 100644 model_signing/serializing/dfs.py create mode 100644 model_signing/serializing/dfs_test.py create mode 100644 model_signing/serializing/serializing.py diff --git a/model_signing/manifest/__init__.py b/model_signing/manifest/__init__.py new file mode 100644 index 00000000..0888a055 --- /dev/null +++ b/model_signing/manifest/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/model_signing/manifest/manifest.py b/model_signing/manifest/manifest.py new file mode 100644 index 00000000..29cbc0d8 --- /dev/null +++ b/model_signing/manifest/manifest.py @@ -0,0 +1,39 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Machinery for representing a serialized ML model. + +Currently, we only support a manifest that wraps around a digest. But, to +support incremental updates and partial signature verification, we need a +manifest that lists files and their digests. That will come in a future change, +soon. 
+""" + +from abc import ABCMeta +from dataclasses import dataclass + +from model_signing.hashing import hashing + + +class Manifest(metaclass=ABCMeta): + """Generic manifest file to represent a model.""" + + pass + + +@dataclass +class DigestManifest(Manifest): + """A manifest that is just a hash.""" + + digest: hashing.Digest diff --git a/model_signing/serializing/__init__.py b/model_signing/serializing/__init__.py new file mode 100644 index 00000000..0888a055 --- /dev/null +++ b/model_signing/serializing/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/model_signing/serializing/dfs.py b/model_signing/serializing/dfs.py new file mode 100644 index 00000000..33b51692 --- /dev/null +++ b/model_signing/serializing/dfs.py @@ -0,0 +1,105 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Model serializers that build a single hash out of a DFS traversal.""" + +import pathlib +from typing import Callable +from model_signing.hashing import file +from model_signing.hashing import hashing +from model_signing.manifest import manifest +from model_signing.serializing import serializing +from typing_extensions import override + + +def _check_file_or_directory(path: pathlib.Path) -> bool: + """Checks that the given path is either a file or a directory.""" + return path.is_file() or path.is_dir() + + +def _build_header(*, entry_name: str, entry_type: str) -> bytes: + """Builds a header to encode a path with given name and type. + + Args: + entry_name: The name of the entry to build the header for. + entry_type: The type of the entry (file or directory). + """ + encoded_type = entry_type.encode("utf-8") + encoded_name = entry_name.encode("utf-8") + return b".".join([encoded_type, encoded_name, b""]) + + +class DFSSerializer(serializing.Serializer): + """Serializer for a model that performs a traversal of the model directory. + + This serializer produces a single hash for the entire model. If the model is + a file, the hash is the digest of the file. If the model is a directory, we + perform a depth-first traversal of the directory, hash each individual file + and aggregate the hashes together. + """ + + def __init__( + self, + file_hasher: file.FileHasher, + merge_hasher_factory: Callable[[], hashing.StreamingHashEngine], + ): + """Initializes an instance to serialize a model with the given hashers. + + Args: + file_hasher: The file hasher used to hash the individual files. + merge_hasher_factory: A callable that returns a + `hashing.StreamingHashEngine` instance used to merge individual + file digests to compute an aggregate digest. 
+ """ + self._file_hasher = file_hasher + self._merge_hasher_factory = merge_hasher_factory + + @override + def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: + # TODO(mihaimaruseac): Add checks for symlinks + if not _check_file_or_directory(model_path): + raise ValueError( + f"Must have a file or directory, but '{model_path}' is neither." + ) + + if model_path.is_file(): + self._file_hasher.set_file(model_path) + return manifest.DigestManifest(self._file_hasher.compute()) + + return manifest.DigestManifest(self._dfs(model_path)) + + def _dfs(self, directory: pathlib.Path) -> hashing.Digest: + # TODO(mihaimaruseac): Add support for excluded files + children = sorted([x for x in directory.iterdir()]) + + hasher = self._merge_hasher_factory() + for child in children: + if not _check_file_or_directory(child): + raise ValueError( + f"Must have a file or directory, but '{child}' is neither." + ) + + if child.is_file(): + header = _build_header(entry_name=child.name, entry_type="file") + hasher.update(header) + self._file_hasher.set_file(child) + digest = self._file_hasher.compute() + hasher.update(digest.digest_value) + else: + header = _build_header(entry_name=child.name, entry_type="dir") + hasher.update(header) + digest = self._dfs(child) + hasher.update(digest.digest_value) + + return hasher.compute() diff --git a/model_signing/serializing/dfs_test.py b/model_signing/serializing/dfs_test.py new file mode 100644 index 00000000..b0794321 --- /dev/null +++ b/model_signing/serializing/dfs_test.py @@ -0,0 +1,281 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from model_signing.hashing import file +from model_signing.hashing import memory +from model_signing.serializing import dfs +import pytest + + +# some constants used throughout testing +_KNOWN_MODEL_TEXT: bytes = b"This is a simple model" +_ANOTHER_MODEL_TEXT: bytes = b"This is another simple model" + + +# Note: Don't make fixtures with global scope as we are altering the models! +@pytest.fixture +def sample_model_file(tmp_path_factory): + file = tmp_path_factory.mktemp("model") / "file" + file.write_bytes(_KNOWN_MODEL_TEXT) + return file + + +@pytest.fixture +def empty_model_file(tmp_path_factory): + file = tmp_path_factory.mktemp("model") / "file" + file.write_bytes(b"") + return file + + +@pytest.fixture +def sample_model_folder(tmp_path_factory): + model_root = tmp_path_factory.mktemp("model") / "root" + model_root.mkdir() + + for i in range(2): + root_dir = model_root / f"d{i}" + root_dir.mkdir() + for j in range(3): + dir_file = root_dir / f"f{i}{j}" + dir_file.write_text(f"This is file f{i}{j} in d{i}.") + + for i in range(4): + root_file = model_root / f"f{i}" + root_file.write_text(f"This is file f{i} in root.") + + return model_root + + +@pytest.fixture +def empty_model_folder(tmp_path_factory): + model_root = tmp_path_factory.mktemp("model") / "root" + model_root.mkdir() + return model_root + + +@pytest.fixture +def deep_model_folder(tmp_path_factory): + model_root = tmp_path_factory.mktemp("model") / "root" + model_root.mkdir() + + current = model_root + for i in range(5): + current = current / f"d{i}" + current.mkdir() + + for i in range(4): 
+ file = current / f"f{i}" + file.write_text(f"This is file f{i}.") + + return model_root + + +class TestDFSSerializer: + + def test_known_file(self, sample_model_file): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest = serializer.serialize(sample_model_file) + expected = ( + "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b" + ) + assert manifest.digest.digest_hex == expected + + def test_file_hash_is_same_as_hash_of_content(self, sample_model_file): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest = serializer.serialize(sample_model_file) + digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute() + assert manifest.digest.digest_hex == digest.digest_hex + + def test_file_model_hash_is_same_if_model_is_moved(self, sample_model_file): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest = serializer.serialize(sample_model_file) + + new_name = sample_model_file.with_name("new-file") + new_file = sample_model_file.rename(new_name) + new_manifest = serializer.serialize(new_file) + + assert manifest == new_manifest + + def test_file_model_hash_changes_if_content_changes( + self, sample_model_file + ): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest = serializer.serialize(sample_model_file) + + sample_model_file.write_bytes(_ANOTHER_MODEL_TEXT) + new_manifest = serializer.serialize(sample_model_file) + + assert manifest.digest.algorithm == new_manifest.digest.algorithm + assert manifest.digest.digest_value != new_manifest.digest.digest_value + + def test_directory_model_with_only_known_file(self, sample_model_file): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + + 
model = sample_model_file.parent + manifest = serializer.serialize(model) + + expected = ( + "aa856f565699473579c8d7009bfad8c421e1643b810f0a28d47b9ce1f0b98ccc" + ) + assert manifest.digest.digest_hex == expected + + digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute() + assert manifest.digest.digest_hex != digest.digest_hex + + def test_known_folder(self, sample_model_folder): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest = serializer.serialize(sample_model_folder) + expected = ( + "516de24dd65c9749bbde333545cb997c645e21c510107fa5c06428e0df84099b" + ) + assert manifest.digest.digest_hex == expected + + def test_folder_model_hash_is_same_if_model_is_moved( + self, sample_model_folder + ): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest = serializer.serialize(sample_model_folder) + + new_name = sample_model_folder.with_name("new-root") + new_model = sample_model_folder.rename(new_name) + new_manifest = serializer.serialize(new_model) + + assert manifest == new_manifest + + def test_empty_file(self, empty_model_file): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest = serializer.serialize(empty_model_file) + expected = ( + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + ) + assert manifest.digest.digest_hex == expected + + def test_directory_model_with_only_empty_file(self, empty_model_file): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest = serializer.serialize(empty_model_file) + model = empty_model_file.parent + manifest = serializer.serialize(model) + expected = ( + "ca671f6b24ce1b08677759ed050a30eb86a28c18abfa2308c7da9e581a8f7917" + ) + assert manifest.digest.digest_hex == expected + + def test_empty_folder(self, 
empty_model_folder): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest = serializer.serialize(empty_model_folder) + expected = ( + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + ) + assert manifest.digest.digest_hex == expected + + def test_empty_folder_hashes_the_same_as_empty_file( + self, empty_model_file, empty_model_folder + ): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + folder_manifest = serializer.serialize(empty_model_folder) + file_manifest = serializer.serialize(empty_model_file) + assert ( + folder_manifest.digest.digest_hex == file_manifest.digest.digest_hex + ) + + def test_folder_model_empty_entry(self, sample_model_folder): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + + # Alter first directory within the model + dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()] + altered_dir = dirs[0] + + new_empty_dir = altered_dir / "empty" + new_empty_dir.mkdir() + manifest1 = serializer.serialize(sample_model_folder) + + new_empty_dir.rmdir() + + new_empty_file = altered_dir / "empty" + new_empty_file.write_text("") + manifest2 = serializer.serialize(sample_model_folder) + + assert manifest1.digest != manifest2.digest + + def test_folder_model_rename_file(self, sample_model_folder): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest1 = serializer.serialize(sample_model_folder) + + # Alter first directory within the model + dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()] + altered_dir = dirs[0] + + # Alter first file in the altered_dir + files = [f for f in altered_dir.iterdir() if f.is_file()] + file_to_rename = files[0] + + new_name = file_to_rename.with_name("new-file") + new_file = 
file_to_rename.rename(new_name) + + manifest2 = serializer.serialize(sample_model_folder) + assert manifest1.digest != manifest2.digest + + def test_folder_model_rename_dir(self, sample_model_folder): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest1 = serializer.serialize(sample_model_folder) + + # Alter first directory within the model + dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()] + dir_to_rename = dirs[0] + + new_name = dir_to_rename.with_name("new-dir") + new_file = dir_to_rename.rename(new_name) + + manifest2 = serializer.serialize(sample_model_folder) + assert manifest1.digest != manifest2.digest + + def test_folder_model_change_file(self, sample_model_folder): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest1 = serializer.serialize(sample_model_folder) + + # Alter first directory within the model + dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()] + altered_dir = dirs[0] + + # Alter first file in the altered_dir + files = [f for f in altered_dir.iterdir() if f.is_file()] + file_to_change = files[0] + file_to_change.write_bytes(_KNOWN_MODEL_TEXT) + + manifest2 = serializer.serialize(sample_model_folder) + assert manifest1.digest != manifest2.digest + + def test_deep_folder(self, deep_model_folder): + file_hasher = file.FileHasher("unused", memory.SHA256()) + serializer = dfs.DFSSerializer(file_hasher, memory.SHA256) + manifest = serializer.serialize(deep_model_folder) + expected = ( + "1ae1b8a653dba20787ae8482611761ee7f1223b15fbfbaa1fce5c55751048d62" + ) + assert manifest.digest.digest_hex == expected diff --git a/model_signing/serializing/serializing.py b/model_signing/serializing/serializing.py new file mode 100644 index 00000000..50c8f729 --- /dev/null +++ b/model_signing/serializing/serializing.py @@ -0,0 +1,33 @@ +# Copyright 2024 The Sigstore Authors +# +# 
Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Machinery for serializing ML models. + +Currently we have only one serializer that performs a DFS traversal of the model +directory, but more serializers are coming soon. +""" + +from abc import ABCMeta, abstractmethod +import pathlib + +from model_signing.manifest import manifest + + +class Serializer(metaclass=ABCMeta): + """Generic ML model format serializer.""" + + @abstractmethod + def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: + """Serializes the model given by the `model_path` argument.""" + pass