Merge pull request #920 from lsst/tickets/DM-41879
DM-41879: Implement URL signing for client/server
dhirving authored Dec 8, 2023
2 parents 1e7b397 + a633a1a commit d550cd6
Showing 6 changed files with 85 additions and 38 deletions.
15 changes: 14 additions & 1 deletion python/lsst/daf/butler/datastores/fileDatastore.py
@@ -1998,10 +1998,23 @@ def get(
def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload:
# Docstring inherited

+        # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the
+        # client should have time to download a large file with retries if
+        # needed, but short enough that it will become obvious quickly that
+        # these URLs expire.
+        # From a strictly technical standpoint there is no reason this
+        # shouldn't be a day or more, but there seems to be a political issue
+        # where people think there is a risk of end users posting presigned
+        # URLs publicly, allowing people without access rights to download.
+        url_expiration_time_seconds = 1 * 60 * 60
+
def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo:
location, file_info = info
return FileDatastoreGetPayloadFileInfo(
-                url=location.uri.geturl(), datastoreRecords=file_info.to_simple()
+                url=location.uri.generate_presigned_get_url(
+                    expiration_time_seconds=url_expiration_time_seconds
+                ),
+                datastoreRecords=file_info.to_simple(),
)

return FileDatastoreGetPayload(
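For readers unfamiliar with presigned URLs: `generate_presigned_get_url` comes from `lsst.resources`, and for an S3-backed location it corresponds roughly to boto3's signing call. A minimal sketch, with a hypothetical bucket and key (the datastore derives these from the file's `Location`):

```python
import boto3

s3_client = boto3.client("s3")
url = s3_client.generate_presigned_url(
    "get_object",
    Params={"Bucket": "my-butler-bucket", "Key": "runs/visit_423/metric.json"},
    ExpiresIn=1 * 60 * 60,  # same one-hour lifetime as url_expiration_time_seconds above
)
# `url` can be handed to a client that has no AWS credentials at all;
# once the hour is up the signature expires and S3 returns 403.
```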
16 changes: 8 additions & 8 deletions python/lsst/daf/butler/datastores/fileDatastoreClient.py
@@ -12,18 +12,18 @@
generate_datastore_get_information,
get_dataset_as_python_object_from_get_info,
)
+from pydantic import AnyHttpUrl


class FileDatastoreGetPayloadFileInfo(_BaseModelCompat):
"""Information required to read a single file stored in `FileDatastore`"""

-    # TODO DM-41879: Allowing arbitrary URLs here is a severe security issue,
-    # since it allows the server to trick the client into fetching data from
-    # any file on its local filesystem or from remote storage using credentials
-    # laying around in the environment. This should be restricted to only
-    # HTTP, but we don't yet have a means of mocking out HTTP gets in tests.
-    url: str
-    """An absolute URL that can be used to read the file"""
+    # This is intentionally restricted to HTTP for security reasons. Allowing
+    # arbitrary URLs here would allow the server to trick the client into
+    # fetching data from any file on its local filesystem or from remote
+    # storage using credentials lying around in the environment.
+    url: AnyHttpUrl
+    """An HTTP URL that can be used to read the file"""

datastoreRecords: SerializedStoredFileInfo
"""`FileDatastore` metadata records for this file"""
@@ -76,7 +76,7 @@ def get_dataset_as_python_object(
The retrieved artifact, converted to a Python object
"""
fileLocations: list[DatasetLocationInformation] = [
-        (Location(None, file_info.url), StoredFileInfo.from_simple(file_info.datastoreRecords))
+        (Location(None, str(file_info.url)), StoredFileInfo.from_simple(file_info.datastoreRecords))
for file_info in payload.file_info
]

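Declaring the field as `AnyHttpUrl` makes pydantic reject non-HTTP schemes at validation time, which is what closes the `file://` hole described in the removed TODO. A small standalone illustration (not the actual model):

```python
from pydantic import AnyHttpUrl, BaseModel, ValidationError


class FileInfo(BaseModel):
    url: AnyHttpUrl


FileInfo(url="https://example.com/data.fits")  # accepted

try:
    FileInfo(url="file:///etc/passwd")  # rejected: scheme must be http or https
except ValidationError as exc:
    print(exc)
```

Note that the validated field is a URL object rather than a plain string, which is why the client code above now wraps it in `str()` before constructing a `Location`.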
19 changes: 11 additions & 8 deletions python/lsst/daf/butler/datastores/file_datastore/get.py
@@ -207,13 +207,14 @@ def _read_artifact_into_memory(
# Do not do this if the size is negative since that indicates
# we do not know.
recorded_size = getInfo.info.file_size
-    resource_size = uri.size()
-    if recorded_size >= 0 and resource_size != recorded_size:
-        raise RuntimeError(
-            "Integrity failure in Datastore. "
-            f"Size of file {uri} ({resource_size}) "
-            f"does not match size recorded in registry of {recorded_size}"
-        )
+
+    def check_resource_size(resource_size: int) -> None:
+        if recorded_size >= 0 and resource_size != recorded_size:
+            raise RuntimeError(
+                "Integrity failure in Datastore. "
+                f"Size of file {uri} ({resource_size}) "
+                f"does not match size recorded in registry of {recorded_size}"
+            )

# For the general case we have choices for how to proceed.
# 1. Always use a local file (downloading the remote resource to a
@@ -225,7 +226,7 @@

formatter = getInfo.formatter
nbytes_max = 10_000_000 # Arbitrary number that we can tune
-    if resource_size <= nbytes_max and formatter.can_read_bytes():
+    if recorded_size >= 0 and recorded_size <= nbytes_max and formatter.can_read_bytes():
with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
if cached_file is not None:
desired_uri = cached_file
@@ -235,6 +236,7 @@
msg = ""
with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
serializedDataset = desired_uri.read()
+            check_resource_size(len(serializedDataset))
log.debug(
"Deserializing %s from %d bytes from location %s with formatter %s",
f"component {getInfo.component}" if isComponent else "",
@@ -271,6 +273,7 @@
location_updated = True

with uri.as_local() as local_uri:
+        check_resource_size(local_uri.size())
can_be_cached = False
if uri != local_uri:
# URI was remote and file was downloaded
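The size check moves into a closure because a presigned GET URL generally cannot be probed for its size up front (the signature typically authorizes only the GET itself, so a separate HEAD-style `uri.size()` call would fail), meaning integrity must be verified against bytes actually retrieved. The pattern, reduced to its essentials as a self-contained sketch:

```python
def make_size_checker(recorded_size: int, uri: str):
    """Return a closure that validates a size observed after retrieval."""

    def check_resource_size(resource_size: int) -> None:
        # A negative recorded size means the registry does not know it;
        # skip the check in that case.
        if recorded_size >= 0 and resource_size != recorded_size:
            raise RuntimeError(
                f"Integrity failure: {uri} has {resource_size} bytes "
                f"but the registry recorded {recorded_size}"
            )

    return check_resource_size


check = make_size_checker(1024, "https://example.com/dataset.fits")
check(len(b"\x00" * 1024))  # passes
# check(512) would raise RuntimeError
```

Because the check is deferred, the small-file fast path above also has to fall back on `recorded_size` (when known) rather than a live `resource_size` to decide whether to read bytes directly.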
22 changes: 17 additions & 5 deletions python/lsst/daf/butler/tests/utils.py
@@ -39,7 +39,7 @@
import astropy
from astropy.table import Table as AstropyTable

-from .. import Butler, Config, StorageClassFactory, Timespan
+from .. import Butler, Config, DatasetRef, StorageClassFactory, Timespan
from ..registry import CollectionType
from ..tests import MetricsExample, addDatasetType

@@ -221,6 +221,10 @@ class MetricTestRepo:
The location of the repository, to pass to ``Butler.makeRepo``.
configFile : `str`
The path to the config file, to pass to ``Butler.makeRepo``.
+    forceConfigRoot : `bool`, optional
+        If `False`, any values present in the supplied ``config`` that
+        would normally be reset are not overridden and will appear
+        directly in the output config. Passed to ``Butler.makeRepo``.
"""

METRICS_EXAMPLE_SUMMARY = {"AM1": 5.2, "AM2": 30.6}
@@ -237,9 +241,9 @@ def _makeExampleMetrics() -> MetricsExample:
[563, 234, 456.7, 752, 8, 9, 27],
)

-    def __init__(self, root: str, configFile: str) -> None:
+    def __init__(self, root: str, configFile: str, forceConfigRoot: bool = True) -> None:
         self.root = root
-        Butler.makeRepo(self.root, config=Config(configFile))
+        Butler.makeRepo(self.root, config=Config(configFile), forceConfigRoot=forceConfigRoot)
butlerConfigFile = os.path.join(self.root, "butler.yaml")
self.storageClassFactory = StorageClassFactory()
self.storageClassFactory.addFromConfig(butlerConfigFile)
@@ -291,7 +295,7 @@ def __init__(self, root: str, configFile: str) -> None:

def addDataset(
self, dataId: dict[str, Any], run: str | None = None, datasetType: DatasetType | None = None
-    ) -> None:
+    ) -> DatasetRef:
"""Create a new example metric and add it to the named run with the
given dataId.
@@ -309,8 +313,16 @@
datasetType : ``DatasetType``, optional
The dataset type of the added dataset. If `None`, will use the
default dataset type.
+
+        Returns
+        -------
+        datasetRef : `DatasetRef`
+            A reference to the added dataset.
"""
if run:
self.butler.registry.registerCollection(run, type=CollectionType.RUN)
metric = self._makeExampleMetrics()
-        self.butler.put(metric, self.datasetType if datasetType is None else datasetType, dataId, run=run)
+        return self.butler.put(
+            metric, self.datasetType if datasetType is None else datasetType, dataId, run=run
+        )
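With the `DatasetRef` return value, test code no longer needs a separate registry query to locate what it just stored. A sketch of the intended usage (root and config paths are illustrative only):

```python
from lsst.daf.butler.tests.utils import MetricTestRepo

# Hypothetical repository location and config path.
repo = MetricTestRepo(root="/tmp/test_repo", configFile="config/basic/butler.yaml")
ref = repo.addDataset({"instrument": "DummyCamComp", "visit": 423}, run="demo-run")

# The returned DatasetRef feeds straight back into Butler APIs:
metric = repo.butler.get(ref)
uris = repo.butler.getURIs(ref)
```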
18 changes: 3 additions & 15 deletions tests/test_butler.py
@@ -48,7 +48,7 @@
try:
import boto3
import botocore
-    from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
+    from lsst.resources.s3utils import clean_test_environment_for_s3
from moto import mock_s3 # type: ignore[import]
except ImportError:
boto3 = None
@@ -114,13 +114,7 @@ def mock_s3(*args: Any, **kwargs: Any) -> Any:  # type: ignore[no-untyped-def]

def clean_environment() -> None:
"""Remove external environment variables that affect the tests."""
-    for k in (
-        "DAF_BUTLER_REPOSITORY_INDEX",
-        "S3_ENDPOINT_URL",
-        "AWS_ACCESS_KEY_ID",
-        "AWS_SECRET_ACCESS_KEY",
-        "AWS_SHARED_CREDENTIALS_FILE",
-    ):
+    for k in ("DAF_BUTLER_REPOSITORY_INDEX",):
os.environ.pop(k, None)


@@ -1993,11 +1987,9 @@ def setUp(self) -> None:
self.bucketName = uri.netloc

# Enable S3 mocking of tests.
+        self.enterContext(clean_test_environment_for_s3())
self.mock_s3.start()

-        # set up some fake credentials if they do not exist
-        self.usingDummyCredentials = setAwsEnvCredentials()
-
if self.useTempRoot:
self.root = self.genRoot()
rooturi = f"s3://{self.bucketName}/{self.root}"
@@ -2035,10 +2027,6 @@ def tearDown(self) -> None:
# Stop the S3 mock.
self.mock_s3.stop()

-        # unset any potentially set dummy credentials
-        if self.usingDummyCredentials:
-            unsetAwsEnvCredentials()
-
if self.reg_dir is not None and os.path.exists(self.reg_dir):
shutil.rmtree(self.reg_dir, ignore_errors=True)

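The per-variable cleanup and dummy-credential bookkeeping is replaced by a single context manager from `lsst.resources`, which scrubs S3-related environment state and restores it on exit. A sketch of the new setup pattern, mirroring the diff above (requires Python 3.11+ for `TestCase.enterContext`; the class name is hypothetical):

```python
import unittest

from lsst.resources.s3utils import clean_test_environment_for_s3
from moto import mock_s3


class ExampleS3TestCase(unittest.TestCase):
    """Sketch of the setup pattern used by the updated tests."""

    def setUp(self) -> None:
        # Scrub any real AWS credentials/endpoints from the environment;
        # unittest restores them automatically when the test finishes.
        self.enterContext(clean_test_environment_for_s3())
        # Route all boto3/botocore S3 traffic to an in-memory fake.
        self.s3_mock = mock_s3()
        self.s3_mock.start()
        self.addCleanup(self.s3_mock.stop)
```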
33 changes: 32 additions & 1 deletion tests/test_server.py
@@ -35,6 +35,8 @@
from lsst.daf.butler.remote_butler import RemoteButler
from lsst.daf.butler.remote_butler.server import Factory, app
from lsst.daf.butler.remote_butler.server._dependencies import factory_dependency
+    from lsst.resources.s3utils import clean_test_environment_for_s3, getS3Client
+    from moto import mock_s3
except ImportError:
TestClient = None
app = None
@@ -68,11 +70,28 @@ class ButlerClientServerTestCase(unittest.TestCase):

@classmethod
def setUpClass(cls):
+        # Set up a mock S3 environment using Moto. Moto also monkeypatches the
+        # `requests` library so that any HTTP requests to presigned S3 URLs get
+        # redirected to the mocked S3.
+        # Note that all files are stored in memory.
+        cls.enterClassContext(clean_test_environment_for_s3())
+        cls.enterClassContext(mock_s3())
+        bucket_name = "anybucketname"  # matches s3Datastore.yaml
+        getS3Client().create_bucket(Bucket=bucket_name)
+
cls.storageClassFactory = StorageClassFactory()

# First create a butler and populate it.
cls.root = makeTestTempDir(TESTDIR)
-        cls.repo = MetricTestRepo(root=cls.root, configFile=os.path.join(TESTDIR, "config/basic/butler.yaml"))
+        cls.repo = MetricTestRepo(
+            root=cls.root,
+            configFile=os.path.join(TESTDIR, "config/basic/butler-s3store.yaml"),
+            forceConfigRoot=False,
+        )
+
+        # Add a file with corrupted data for testing error conditions.
+        cls.dataset_with_corrupted_data = _create_corrupted_dataset(cls.repo)
+
# Override the server's Butler initialization to point at our test repo
server_butler = Butler.from_config(cls.root, writeable=True)

@@ -212,6 +231,9 @@ def test_get(self):
with self.assertRaises(LookupError):
self.butler.get(invalid_ref)

+        with self.assertRaises(RuntimeError):
+            self.butler.get(self.dataset_with_corrupted_data)
+
# Test storage class override
new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
converted = self.butler.get(ref, storageClass=new_sc)
@@ -220,5 +242,14 @@
self.assertEqual(metric, converted)


+def _create_corrupted_dataset(repo: MetricTestRepo) -> DatasetRef:
+    run = "corrupted-run"
+    ref = repo.addDataset({"instrument": "DummyCamComp", "visit": 423}, run=run)
+    uris = repo.butler.getURIs(ref)
+    oneOfTheComponents = list(uris.componentURIs.values())[0]
+    oneOfTheComponents.write(b"corrupted data")
+    return ref


if __name__ == "__main__":
unittest.main()
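Because Moto intercepts at the HTTP layer, a presigned URL generated against the mocked S3 can be fetched with plain `requests` inside a test, which is what lets the signing path in `prepare_get_for_external_client` be exercised end to end. A minimal round trip under the same Moto assumptions (bucket and key chosen for illustration):

```python
import boto3
import requests
from moto import mock_s3


@mock_s3
def presigned_round_trip() -> bytes:
    client = boto3.client("s3", region_name="us-east-1")
    client.create_bucket(Bucket="anybucketname")
    client.put_object(Bucket="anybucketname", Key="data.json", Body=b"{}")
    url = client.generate_presigned_url(
        "get_object",
        Params={"Bucket": "anybucketname", "Key": "data.json"},
        ExpiresIn=3600,
    )
    # Moto's patched `requests` serves this from the in-memory bucket.
    return requests.get(url).content


assert presigned_round_trip() == b"{}"
```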