From 3fa4cf6f34dcc376400f01df69bf168b40692ab2 Mon Sep 17 00:00:00 2001 From: Erin Cochran Date: Wed, 11 Sep 2024 15:31:14 -0400 Subject: [PATCH] [docs-revamp] - Clean up External assets guide (#24402) ## Summary & Motivation ## How I Tested These Changes ## Changelog NOCHANGELOG --------- Co-authored-by: colton --- docs/docs-beta/docs/guides/external-assets.md | 76 ++++++++++--------- .../creating-external-assets.py | 4 +- .../external-assets/dag-of-external-assets.py | 4 + .../external-assets/pulling-with-sensors.py | 12 +-- 4 files changed, 56 insertions(+), 40 deletions(-) diff --git a/docs/docs-beta/docs/guides/external-assets.md b/docs/docs-beta/docs/guides/external-assets.md index 27c306429e681..28cb110f6c33d 100644 --- a/docs/docs-beta/docs/guides/external-assets.md +++ b/docs/docs-beta/docs/guides/external-assets.md @@ -1,67 +1,69 @@ -- title: Representing external data sources with external assets sidebar_position: 80 -sidebar_label: 'External data assets' - +sidebar_label: 'External data sources' --- -One of Dagster's goals is to present a single unified lineage of all of the data assets in an organization. This can include assets orchestrated by Dagster and assets orchestrated by other systems. - -**External assets** enable you to model assets orchestrated by other systems natively within Dagster's Asset catalog, and create new data assets downstream of these external assets. +One of Dagster's goals is to present a single unified lineage of all of the data assets in an organization, even if those assets are orchestrated by systems other than Dagster. -External assets differ from native Dagster assets in that Dagster can't materialize them directly or put them on a schedule. Instead, an external system must inform Dagster of when an external asset is updated. +With **external assets**, you can model assets orchestrated by other systems natively within Dagster, ensuring you have a comprehensive catalog of your organization's data. 
You can also create new data assets downstream of these external assets. -Examples of external assets could be files in a data lake that are populated by a bespoke internal tool, a CSV file delivered daily by SFTP from a partner, or a table in a data warehouse populated by another orchestrator. +Unlike native assets, external assets can't be materialized directly by Dagster or put on a schedule. Instead, an external system must inform Dagster when an external asset is updated. -## What you'll learn +For example, external assets could be: -- How to create external assets -- How to create assets that depend on external assets -- How to record materializations and metadata -- How to model a DAG of multiple external assets - ---- +- Files in a data lake that are populated by a bespoke internal tool +- A CSV file delivered daily by SFTP from a partner +- A table in a data warehouse populated by another orchestrator
Prerequisites To follow the steps in this guide, you'll need: -- A basic understanding of Dagster and assets. See the [Quick Start](/getting-started/quickstart) tutorial for an overview. +- Familiarity with [Assets](/guides/data-assets) - Familiarity with [Sensors](/guides/sensors)
---- +## Defining external assets -## Creating and depending on external assets +Let's say you have a partner who sends you raw transaction data by SFTP on an almost daily basis. This data is later cleaned and stored in an internal data lake. -Let's imagine that we have a partner that sends us some raw transaction data by SFTP on, roughly, a daily basis, that's later cleaned and stored in an internal data lake. Because the raw transaction data isn't materialized by Dagster, it makes sense to model it as an external asset. +Because the raw transaction data isn't materialized by Dagster, it makes sense to model it as an external asset. The following example accomplishes this by using `AssetSpec`: - + -See the [AssetSpec API docs](/todo) for all the potential parameters you can provide to an external asset. +Refer to the [`AssetSpec` API docs](/todo) for the parameters you can provide to an external asset. ## Recording materializations and metadata -In the preceding example, we modeled the external asset in the asset graph. We also need to inform Dagster whenever an external asset is updated, and include any relevant metadata about the asset. +When an external asset is modeled in Dagster, you also need to inform Dagster whenever the external asset is updated. You should also include any relevant metadata about the asset, such as the time it was last updated. -There are two main ways to do this: "pulling" external assets events with sensors, and "pushing" external asset events using the REST API. +There are two main ways to do this: -### "Pulling" with sensors +- Pulling external asset events with sensors +- Pushing external asset events using Dagster's REST API -You can use a Dagster [sensor](/guides/sensors) to regularly poll the external system and "pull" information about the external asset into Dagster. 
+### Pulling with sensors -For example, here's how you would poll an external system (like an SFTP server) to update an external asset whenever the file is changed. +You can use a Dagster [sensor](/guides/sensors) to regularly poll the external system and pull information about the external asset into Dagster. - +For example, here's how you would poll an external system like an SFTP server to update an external asset whenever the file is changed. -See the [sensors guide](/guides/sensors) for more information about sensors. + -### "Pushing" with the REST API +Refer to the [Sensors guide](/guides/sensors) for more information about sensors. -You can inform Dagster that an external asset has materialized by "pushing" the event from an external system to the REST API. +### Pushing with the REST API -For example, here's how we would inform Dagster of a materialization of the `raw_transactions` external asset in Dagster+: +You can inform Dagster that an external asset has materialized by pushing the event from an external system to the REST API. The following examples demonstrate how to inform Dagster that a materialization of the `raw_transactions` external asset has occurred. + +The required headers for the REST API depend on whether you're using Dagster+ or OSS. Use the tabs to view an example API request for each Dagster type. + + + + +Authentication headers are required if using Dagster+. The request should be made to your Dagster+ organization and a specific deployment in the organization. ```shell curl \ @@ -78,7 +80,10 @@ curl \ }' ``` -If you're using open source, you don't need the authentication headers and should point it at your open source URL (in this example, `http://localhost:3000`): + + + +Authentication headers aren't required if using Dagster OSS. The request should be pointed at your open source URL, which is `http://localhost:3000` in this example. 
```shell curl \ @@ -94,10 +99,13 @@ curl \ }' ``` -See the [external assets REST API docs](/todo) for more information. + + + +Refer to the [External assets REST API documentation](/todo) for more information. -## Modeling a DAG of external assets +## Modeling a graph of external assets Like regular Dagster assets, external assets can have dependencies. This is useful when you want to model an entire data pipeline orchestrated by another system. - + diff --git a/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/creating-external-assets.py b/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/creating-external-assets.py index d7d52f1918cb9..f9668dfe91267 100644 --- a/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/creating-external-assets.py +++ b/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/creating-external-assets.py @@ -1,8 +1,9 @@ import dagster as dg -# We define a new external asset with the key "raw_transactions". +# Define an external asset with the key "raw_transactions". # This will appear in the Dagster asset catalog, but cannot # be materialized by Dagster itself. +# highlight-next-line raw_transactions = dg.AssetSpec("raw_transactions") @@ -12,4 +13,5 @@ def cleaned_transactions(): ... 
+# Define the Definitions object defs = dg.Definitions(assets=[raw_transactions, cleaned_transactions]) diff --git a/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/dag-of-external-assets.py b/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/dag-of-external-assets.py index bfee5bc7024b6..72be481a639c1 100644 --- a/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/dag-of-external-assets.py +++ b/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/dag-of-external-assets.py @@ -1,13 +1,17 @@ import dagster as dg +# highlight-start # Three external assets that depend on each other raw_data = dg.AssetSpec("raw_data") stg_data = dg.AssetSpec("stg_data", deps=[raw_data]) cleaned_data = dg.AssetSpec("cleaned_data", deps=[stg_data]) +# highlight-end +# Native asset that depends on an external asset @dg.asset(deps=[cleaned_data]) def derived_data(): ... +# Define the Definitions object defs = dg.Definitions(assets=[raw_data, stg_data, cleaned_data, derived_data]) diff --git a/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/pulling-with-sensors.py b/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/pulling-with-sensors.py index 07498a00f28c7..b9dbf09ae9e90 100644 --- a/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/pulling-with-sensors.py +++ b/examples/docs_beta_snippets/docs_beta_snippets/guides/data-modeling/external-assets/pulling-with-sensors.py @@ -1,5 +1,6 @@ import dagster as dg +# Define the external asset raw_transactions = dg.AssetSpec("raw_transactions") @@ -7,18 +8,18 @@ def raw_transactions_sensor( context: dg.SensorEvaluationContext, ) -> dg.SensorResult: - # This sensor polls the external system every 30 seconds - # for the last time the file was modified. 
+ # Poll the external system every 30 seconds + # for the last time the file was modified file_last_modified_at_ms = ... - # We can use the cursor to store the last time the sensor updated the asset + # Use the cursor to store the last time the sensor updated the asset if context.cursor is not None: external_asset_last_updated_at_ms = float(context.cursor) else: external_asset_last_updated_at_ms = 0 if file_last_modified_at_ms > external_asset_last_updated_at_ms: - # The external asset has been modified since we last updated it, + # The external asset has been modified since it was last updated, # so record a materialization and update the cursor. return dg.SensorResult( asset_events=[ @@ -31,8 +32,9 @@ def raw_transactions_sensor( cursor=str(file_last_modified_at_ms), ) else: - # Nothing has happened since the last time we checked + # Nothing has happened since the last check return dg.SensorResult() +# Define the Definitions object defs = dg.Definitions(assets=[raw_transactions], sensors=[raw_transactions_sensor])