Skip to content

Commit

Permalink
migrate ArcGIS Online datasets to ingest (#1330)
Browse files Browse the repository at this point in the history
* drop unused file

* drop empty file

* Create esri feature server model in ingest:
* Add default value for crs attribute (we pull geojson format from source)
* Use models.connectors.esri code to generate arcgis link

* Update downstream code that creates `Config` object:
* `get_filename()` fn
* `get_version()` for `ingest.ESRIFeatureServer`

* Download data from arcgis

* Add tests for: `get_version()`, `get_filename()`, `download_file_from_source()`

* Add arcgis test template

---------

Co-authored-by: sf-dcp <[email protected]>
  • Loading branch information
damonmcc and sf-dcp authored Feb 26, 2025
1 parent bf66dde commit 2f0a75a
Show file tree
Hide file tree
Showing 10 changed files with 99 additions and 20 deletions.
7 changes: 7 additions & 0 deletions dcpy/connectors/esri/arcgis_feature_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
TimeRemainingColumn,
)
import yaml
import json

from dcpy.models.connectors.esri import FeatureServer, FeatureServerLayer
import dcpy.models.product.dataset.metadata as models
Expand Down Expand Up @@ -184,6 +185,12 @@ def _downcase_properties_keys(feat):
return {"type": "FeatureCollection", "crs": crs, "features": features}


def download_layer(layer: FeatureServerLayer, crs: str, path: Path) -> None:
    """Fetch a feature server layer with ``get_layer`` and write it to ``path`` as JSON.

    Args:
        layer: The feature server layer to download.
        crs: Coordinate reference system as an EPSG identifier, e.g. "EPSG:4326".
        path: Destination file. It is opened in write mode, so any existing
            content is replaced.

    Raises:
        ValueError: If what remains of ``crs`` after dropping a leading
            "EPSG:" prefix is not an integer.
    """
    # removeprefix drops only the literal "EPSG:" prefix. str.strip("EPSG:")
    # would instead treat its argument as a set of characters to trim from
    # both ends of the string.
    epsg_code = int(crs.removeprefix("EPSG:"))
    geojson = get_layer(layer, crs=epsg_code)
    with open(path, "w") as f:
        json.dump(geojson, f)


def make_dcp_metadata(layer_url: str) -> models.Metadata:
if layer_url.endswith("FeatureServer/0"):
layer_url = layer_url + "?f=pjson"
Expand Down
8 changes: 8 additions & 0 deletions dcpy/lifecycle/ingest/configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
S3Source,
ScriptSource,
DEPublished,
ESRIFeatureServer,
Source,
ProcessingStep,
Template,
Expand All @@ -23,6 +24,7 @@
from dcpy.utils import metadata
from dcpy.utils.logging import logger
from dcpy.connectors.socrata import extract as extract_socrata
from dcpy.connectors.esri import arcgis_feature_service
from dcpy.connectors.edm import publishing

TEMPLATE_DIR = Path(__file__).parent / "templates"
Expand Down Expand Up @@ -75,6 +77,10 @@ def get_version(source: Source, timestamp: datetime | None = None) -> str:
"Unable to determine latest version. If archiving known version, please provide it."
)
return version
case ESRIFeatureServer():
return arcgis_feature_service.get_data_last_updated(
source.feature_server_layer
).strftime("%Y%m%d")
case _:
if timestamp is None:
raise TypeError(
Expand Down Expand Up @@ -102,6 +108,8 @@ def get_filename(source: Source, ds_id: str) -> str:
return f"{ds_id}.{source.extension}"
case S3Source():
return Path(source.key).name
case ESRIFeatureServer():
return f"{ds_id}.json"
case _:
raise NotImplementedError(
f"Source type {source} not supported for get_filename"
Expand Down
8 changes: 8 additions & 0 deletions dcpy/lifecycle/ingest/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
S3Source,
ScriptSource,
DEPublished,
ESRIFeatureServer,
Source,
)
from dcpy.models.connectors import socrata, web as web_models
from dcpy.models.connectors.edm.publishing import GisDataset
from dcpy.utils import s3
from dcpy.connectors.edm import publishing
from dcpy.connectors.socrata import extract as extract_socrata
from dcpy.connectors.esri import arcgis_feature_service
from dcpy.connectors import web


Expand Down Expand Up @@ -48,6 +50,12 @@ def download_file_from_source(
web.download_file(source.endpoint, path)
case socrata.Source():
extract_socrata.download(source, path)
case ESRIFeatureServer():
arcgis_feature_service.download_layer(
source.feature_server_layer,
source.crs,
path,
)
case _:
raise NotImplementedError(
f"Source type {source.type} not supported for download_file_from_source"
Expand Down
Empty file removed dcpy/models/connectors/doe.py
Empty file.
28 changes: 27 additions & 1 deletion dcpy/models/lifecycle/ingest.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
from __future__ import annotations
from functools import cached_property
from datetime import datetime
from pathlib import Path
from pydantic import BaseModel, Field, AliasChoices
from typing import Any, Literal, TypeAlias

from dcpy.utils.metadata import RunDetails
from dcpy.models.connectors.edm import recipes, publishing
from dcpy.models.connectors import web, socrata
from dcpy.models.connectors import web, socrata, esri
from dcpy.models import file
from dcpy.models.base import SortedSerializedBase
from dcpy.models.dataset import Column as BaseColumn, COLUMN_TYPES

from dcpy.connectors.esri import arcgis_feature_service


class LocalFileSource(BaseModel, extra="forbid"):
type: Literal["local_file"]
Expand All @@ -35,6 +38,28 @@ class DEPublished(BaseModel, extra="forbid"):
filename: str


class ESRIFeatureServer(BaseModel, extra="forbid"):
    # Ingest source that points at a layer of an ESRI feature server.
    type: Literal["esri"]
    server: esri.Server
    dataset: str
    layer_name: str | None = None
    layer_id: int | None = None
    # Defaults to EPSG:4326, following the geojson specification.
    crs: str = "EPSG:4326"

    @property
    def feature_server(self) -> esri.FeatureServer:
        """Feature server built from this source's ``server`` and ``dataset``."""
        return esri.FeatureServer(server=self.server, name=self.dataset)

    @cached_property
    def feature_server_layer(self) -> esri.FeatureServerLayer:
        """Layer resolved by ``arcgis_feature_service.resolve_layer``.

        Cached after the first access, so the resolution runs at most once
        per instance.
        """
        return arcgis_feature_service.resolve_layer(
            feature_server=self.feature_server,
            layer_name=self.layer_name,
            layer_id=self.layer_id,
        )


Source: TypeAlias = (
LocalFileSource
| web.FileDownloadSource
Expand All @@ -44,6 +69,7 @@ class DEPublished(BaseModel, extra="forbid"):
| DEPublished
| S3Source
| ScriptSource
| ESRIFeatureServer
)


Expand Down
20 changes: 20 additions & 0 deletions dcpy/test/connectors/test_esri.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,23 @@ def test_get_layer_chunked(self, post: MagicMock):

# one call to get ids, three calls to get data
assert post.call_count == 4


@patch("requests.get", side_effect=mock_request_get)
@patch("requests.post", side_effect=mock_query_layer)
def test_download_layer(post, get, create_temp_filesystem):
    """download_layer should create the output file at the requested path."""
    # Stacked patch decorators inject their mocks bottom-up: the innermost
    # patch (requests.post) arrives as the first positional argument.
    output_path = create_temp_filesystem / "does_not_exist.geojson"
    layer = FeatureServerLayer(
        server=Server.nys_parks,
        name="National_Register_Building_Listings",
        layer_name=LAYER_NAME,
        layer_id=LAYER_ID,
    )
    arcfs.download_layer(
        layer=layer,
        crs="EPSG:3857",
        path=output_path,
    )
    assert output_path.exists()
18 changes: 0 additions & 18 deletions dcpy/test/lifecycle/ingest/resources/sources.yml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
id: nysparks_historicplaces
acl: public-read

attributes:
name: National Register of Historic Places

ingestion:
source:
type: esri
server: nys_parks
dataset: National_Register_Building_Listings
layer_id: 13
file_format:
type: geojson
10 changes: 9 additions & 1 deletion dcpy/test/lifecycle/ingest/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from dcpy.models.connectors.edm.publishing import GisDataset
from dcpy.models.connectors.edm.recipes import Dataset
from dcpy.models import file, library
from dcpy.models.connectors import socrata, web
from dcpy.models.connectors import socrata, web, esri
from dcpy.models.lifecycle.ingest import (
LocalFileSource,
ScriptSource,
Expand All @@ -14,6 +14,7 @@
ArchivalMetadata,
Ingestion,
Config,
ESRIFeatureServer,
)
from dcpy.utils.metadata import get_run_details
from dcpy.test.conftest import RECIPES_BUCKET
Expand Down Expand Up @@ -47,6 +48,12 @@ class Sources:
de_publish = DEPublished(
type="de-published", product=TEST_DATASET_NAME, filename="file.csv"
)
esri = ESRIFeatureServer(
type="esri",
server=esri.Server.nys_parks,
dataset="National_Register_Building_Listings",
layer_id=13,
)


BASIC_CONFIG = Config(
Expand Down Expand Up @@ -83,4 +90,5 @@ class Sources:
(Sources.api, f"{TEST_DATASET_NAME}.json"),
(Sources.socrata, f"{TEST_DATASET_NAME}.csv"),
(Sources.s3, "test.txt"),
(Sources.esri, f"{TEST_DATASET_NAME}.json"),
]
6 changes: 6 additions & 0 deletions dcpy/test/lifecycle/ingest/test_configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ def test_socrata(self, get):
### based on mocked response in dcpy/test/conftest.py
assert configure.get_version(source) == "20240412"

@mock.patch("requests.get", side_effect=mock_request_get)
def test_esri(self, get):
    """get_version for an ESRI source returns the layer's last-updated date as YYYYMMDD."""
    source = Sources.esri
    ### based on mocked response in dcpy/test/conftest.py
    # Without `assert` the comparison result is discarded and the test
    # can never fail.
    assert configure.get_version(source) == "20240806"

def test_gis_dataset(self, create_buckets):
datestring = "20240412"
s3.client().put_object(
Expand Down

0 comments on commit 2f0a75a

Please sign in to comment.