From 6dca8c274e7e35b1539b308574517a82a86df011 Mon Sep 17 00:00:00 2001 From: Yuhan Luo <4531914+yuhan@users.noreply.github.com> Date: Wed, 11 Sep 2024 11:16:34 -0700 Subject: [PATCH] integration docs beta 1/ replicate all integration pages from mkt site to beta docs (#24330) ## Summary & Motivation copy over from https://dagster.io/integrations (note: this content is up-to-date per PR stack https://github.com/dagster-io/dagster-website/pull/1280 few weeks ago) this PR made the following changes: 1. update title to "Dagster & " 2. add `sidebar_label: " so it won't show a wall of "Dagster &" on left nav 3. fix all vale errors and warnings, including a lot of vale accept additions 4. rename files from `dagster-.mdx` to just `.md` Next steps in later stack: - move code to python files - improve navigation: Index page and/or left nav to bucket integrations into categories, differentiate community owned Later steps: - improve doc content page one by one (e.g. template guides, reuse the good ones from current docs site) **Open discussion:** Figure out the relationship between docs and marketing site regarding integrations. * Option 1: no dagster.io/integrations and redirect that to docs.dagster.io/integrations * Yuhan's pick: I'm actually leaning towards this to completely consolidate all integration contents into the docs site for simplicity and ease of navigation so there won't be two similar content on two different sites, but I'd need to consult the SEO implication in this option. * Option 2: keep both dagster.io/integrations and docs.dagster.io/integrations; no code in marketing site, only for SEO purpose; docs pages focus on more technical guides/references. ## How I Tested These Changes **see in preview: https://dagster-docs-beta-211qncb7r-elementl.vercel.app/integrations** ## Changelog `NOCHANGELOG` --------- Co-authored-by: colton --- .../docs/getting-started/installation.md | 6 +- .../docs/guides/asset-dependencies.md | 2 +- docs/docs-beta/docs/integrations.md | 3 +- docs/docs-beta/docs/integrations/airbyte.md | 52 +++++++ .../docs-beta/docs/integrations/aws-athena.md | 48 ++++++ .../docs/integrations/aws-cloudwatch.md | 63 ++++++++ docs/docs-beta/docs/integrations/aws-ecr.md | 58 ++++++++ docs/docs-beta/docs/integrations/aws-emr.md | 96 ++++++++++++ docs/docs-beta/docs/integrations/aws-glue.md | 71 +++++++++ .../docs-beta/docs/integrations/aws-lambda.md | 62 ++++++++ .../docs/integrations/aws-redshift.md | 58 ++++++++ docs/docs-beta/docs/integrations/aws-s3.md | 66 +++++++++ .../docs/integrations/aws-secretsmanager.md | 67 +++++++++ docs/docs-beta/docs/integrations/aws-ssm.md | 70 +++++++++ .../docs/integrations/azure-adls2.md | 64 ++++++++ docs/docs-beta/docs/integrations/census.md | 49 +++++++ docs/docs-beta/docs/integrations/cube.md | 61 ++++++++ .../docs-beta/docs/integrations/databricks.md | 138 ++++++++++++++++++ docs/docs-beta/docs/integrations/datadog.md | 60 ++++++++ docs/docs-beta/docs/integrations/dbt-cloud.md | 51 +++++++ docs/docs-beta/docs/integrations/dbt.md | 87 +++++++++++ docs/docs-beta/docs/integrations/deltalake.md | 43 ++++++ docs/docs-beta/docs/integrations/dlt.md | 66 +++++++++ docs/docs-beta/docs/integrations/docker.md | 67 +++++++++ docs/docs-beta/docs/integrations/duckdb.md | 50 +++++++ docs/docs-beta/docs/integrations/fivetran.md | 45 ++++++ .../docs/integrations/gcp-bigquery.md | 49 +++++++ .../docs/integrations/gcp-dataproc.md | 62 ++++++++ docs/docs-beta/docs/integrations/gcp-gcs.md | 59 ++++++++ docs/docs-beta/docs/integrations/github.md | 60 ++++++++ 
docs/docs-beta/docs/integrations/hashicorp.md | 58 ++++++++ docs/docs-beta/docs/integrations/hightouch.md | 58 ++++++++ docs/docs-beta/docs/integrations/jupyter.md | 22 +++ .../docs-beta/docs/integrations/kubernetes.md | 61 ++++++++ docs/docs-beta/docs/integrations/lakefs.md | 77 ++++++++++ docs/docs-beta/docs/integrations/looker.md | 46 ++++++ docs/docs-beta/docs/integrations/meltano.md | 50 +++++++ .../docs/integrations/microsoft-teams.md | 52 +++++++ .../docs/integrations/open-metadata.md | 28 ++++ docs/docs-beta/docs/integrations/openai.md | 67 +++++++++ docs/docs-beta/docs/integrations/pagerduty.md | 57 ++++++++ docs/docs-beta/docs/integrations/pandas.md | 31 ++++ docs/docs-beta/docs/integrations/pandera.md | 69 +++++++++ .../docs-beta/docs/integrations/prometheus.md | 54 +++++++ docs/docs-beta/docs/integrations/sdf.md | 77 ++++++++++ docs/docs-beta/docs/integrations/secoda.md | 28 ++++ docs/docs-beta/docs/integrations/shell.md | 56 +++++++ docs/docs-beta/docs/integrations/slack.md | 52 +++++++ docs/docs-beta/docs/integrations/sling.md | 94 ++++++++++++ docs/docs-beta/docs/integrations/snowflake.md | 60 ++++++++ docs/docs-beta/docs/integrations/spark.md | 28 ++++ docs/docs-beta/docs/integrations/ssh-sftp.md | 50 +++++++ docs/docs-beta/docs/integrations/twilio.md | 57 ++++++++ docs/docs-beta/docs/integrations/wandb.md | 46 ++++++ docs/docs-beta/sidebars.ts | 17 ++- .../config/vocabularies/Dagster/accept.txt | 46 +++++- 56 files changed, 3061 insertions(+), 13 deletions(-) create mode 100644 docs/docs-beta/docs/integrations/airbyte.md create mode 100644 docs/docs-beta/docs/integrations/aws-athena.md create mode 100644 docs/docs-beta/docs/integrations/aws-cloudwatch.md create mode 100644 docs/docs-beta/docs/integrations/aws-ecr.md create mode 100644 docs/docs-beta/docs/integrations/aws-emr.md create mode 100644 docs/docs-beta/docs/integrations/aws-glue.md create mode 100644 docs/docs-beta/docs/integrations/aws-lambda.md create mode 100644 docs/docs-beta/docs/integrations/aws-redshift.md create mode 100644 docs/docs-beta/docs/integrations/aws-s3.md create mode 100644 docs/docs-beta/docs/integrations/aws-secretsmanager.md create mode 100644 docs/docs-beta/docs/integrations/aws-ssm.md create mode 100644 docs/docs-beta/docs/integrations/azure-adls2.md create mode 100644 docs/docs-beta/docs/integrations/census.md create mode 100644 docs/docs-beta/docs/integrations/cube.md create mode 100644 docs/docs-beta/docs/integrations/databricks.md create mode 100644 docs/docs-beta/docs/integrations/datadog.md create mode 100644 docs/docs-beta/docs/integrations/dbt-cloud.md create mode 100644 docs/docs-beta/docs/integrations/dbt.md create mode 100644 docs/docs-beta/docs/integrations/deltalake.md create mode 100644 docs/docs-beta/docs/integrations/dlt.md create mode 100644 docs/docs-beta/docs/integrations/docker.md create mode 100644 docs/docs-beta/docs/integrations/duckdb.md create mode 100644 docs/docs-beta/docs/integrations/fivetran.md create mode 100644 docs/docs-beta/docs/integrations/gcp-bigquery.md create mode 100644 docs/docs-beta/docs/integrations/gcp-dataproc.md create mode 100644 docs/docs-beta/docs/integrations/gcp-gcs.md create mode 100644 docs/docs-beta/docs/integrations/github.md create mode 100644 docs/docs-beta/docs/integrations/hashicorp.md create mode 100644 docs/docs-beta/docs/integrations/hightouch.md create mode 100644 docs/docs-beta/docs/integrations/jupyter.md create mode 100644 docs/docs-beta/docs/integrations/kubernetes.md create mode 100644 
docs/docs-beta/docs/integrations/lakefs.md create mode 100644 docs/docs-beta/docs/integrations/looker.md create mode 100644 docs/docs-beta/docs/integrations/meltano.md create mode 100644 docs/docs-beta/docs/integrations/microsoft-teams.md create mode 100644 docs/docs-beta/docs/integrations/open-metadata.md create mode 100644 docs/docs-beta/docs/integrations/openai.md create mode 100644 docs/docs-beta/docs/integrations/pagerduty.md create mode 100644 docs/docs-beta/docs/integrations/pandas.md create mode 100644 docs/docs-beta/docs/integrations/pandera.md create mode 100644 docs/docs-beta/docs/integrations/prometheus.md create mode 100644 docs/docs-beta/docs/integrations/sdf.md create mode 100644 docs/docs-beta/docs/integrations/secoda.md create mode 100644 docs/docs-beta/docs/integrations/shell.md create mode 100644 docs/docs-beta/docs/integrations/slack.md create mode 100644 docs/docs-beta/docs/integrations/sling.md create mode 100644 docs/docs-beta/docs/integrations/snowflake.md create mode 100644 docs/docs-beta/docs/integrations/spark.md create mode 100644 docs/docs-beta/docs/integrations/ssh-sftp.md create mode 100644 docs/docs-beta/docs/integrations/twilio.md create mode 100644 docs/docs-beta/docs/integrations/wandb.md diff --git a/docs/docs-beta/docs/getting-started/installation.md b/docs/docs-beta/docs/getting-started/installation.md index 3b00b5d04351f..b8c7d5449b6f3 100644 --- a/docs/docs-beta/docs/getting-started/installation.md +++ b/docs/docs-beta/docs/getting-started/installation.md @@ -1,8 +1,8 @@ --- -title: "Installing Dagster" -description: "Learn how to install Dagster" +title: Installing Dagster +description: Learn how to install Dagster sidebar_position: 20 -sidebar_label: "Installation" +sidebar_label: Installation --- # Installing Dagster diff --git a/docs/docs-beta/docs/guides/asset-dependencies.md b/docs/docs-beta/docs/guides/asset-dependencies.md index 7e0f0c97c77c2..421a61a08fece 100644 --- a/docs/docs-beta/docs/guides/asset-dependencies.md +++ b/docs/docs-beta/docs/guides/asset-dependencies.md @@ -92,7 +92,7 @@ Consider this example: -This example downloads a zip file from Google Drive, unzips it, and loads the data into a pandas DataFrame. It relies on each asset running on the same file system to perform these operations. +This example downloads a zip file from Google Drive, unzips it, and loads the data into a Pandas DataFrame. It relies on each asset running on the same file system to perform these operations. The assets are modeled as tasks, rather than as data assets. For more information on the difference between tasks and data assets, check out the [Thinking in Assets](/concepts/assets/thinking-in-assets) guide. diff --git a/docs/docs-beta/docs/integrations.md b/docs/docs-beta/docs/integrations.md index 93781a0437102..881d845d26c02 100644 --- a/docs/docs-beta/docs/integrations.md +++ b/docs/docs-beta/docs/integrations.md @@ -1,5 +1,6 @@ --- -title: "Integrations" +title: 'Integrations' +displayed_sidebar: 'integrations' --- # Integrations diff --git a/docs/docs-beta/docs/integrations/airbyte.md b/docs/docs-beta/docs/integrations/airbyte.md new file mode 100644 index 0000000000000..6a65ac2cceb8d --- /dev/null +++ b/docs/docs-beta/docs/integrations/airbyte.md @@ -0,0 +1,52 @@ +--- +layout: Integration +status: published +name: Airbyte +title: Dagster & Airbyte +sidebar_label: Airbyte +excerpt: Orchestrate Airbyte connections and schedule syncs alongside upstream or downstream dependencies. 
+date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-airbyte +docslink: https://docs.dagster.io/integrations/airbyte +partnerlink: https://airbyte.com/tutorials/orchestrate-data-ingestion-and-transformation-pipelines +logo: /integrations/airbyte.svg +categories: + - ETL +enabledBy: +enables: +--- + +### About this integration + +Using this integration, you can trigger Airbyte syncs and orchestrate your Airbyte connections from within Dagster, making it easy to chain an Airbyte sync with upstream or downstream steps in your workflow. + +### Installation + +```bash +pip install dagster-airbyte +``` + +### Example + +```python +from dagster import EnvVar +from dagster_airbyte import AirbyteResource, load_assets_from_airbyte_instance +import os + +# Connect to your OSS Airbyte instance +airbyte_instance = AirbyteResource( + host="localhost", + port="8000", + # If using basic auth, include username and password: + username="airbyte", + password=EnvVar("AIRBYTE_PASSWORD") +) + +# Load all assets from your Airbyte instance +airbyte_assets = load_assets_from_airbyte_instance(airbyte_instance) + +``` + +### About Airbyte + +**Airbyte** is an open source data integration engine that helps you consolidate your SaaS application and database data into your data warehouses, lakes and databases. diff --git a/docs/docs-beta/docs/integrations/aws-athena.md b/docs/docs-beta/docs/integrations/aws-athena.md new file mode 100644 index 0000000000000..94fd8766e4eb1 --- /dev/null +++ b/docs/docs-beta/docs/integrations/aws-athena.md @@ -0,0 +1,48 @@ +--- +layout: Integration +status: published +name: AWS Athena +title: Dagster & AWS Athena +sidebar_label: AWS Athena +excerpt: This integration allows you to connect to AWS Athena and analyze data in Amazon S3 using standard SQL within your Dagster pipelines. +date: 2024-06-21 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-aws +docslink: +partnerlink: https://aws.amazon.com/ +logo: /integrations/aws-athena.svg +categories: + - Storage +enabledBy: +enables: +--- + +### About this integration + +This integration allows you to connect to AWS Athena, a serverless interactive query service that makes it easy to analyze data in Amazon S3 using standard SQL. Using this integration, you can issue queries to Athena, fetch results, and handle query execution states within your Dagster pipelines. + +### Installation + +```bash +pip install dagster-aws +``` + +### Examples + +```python +from dagster import Definitions, asset +from dagster_aws.athena import AthenaClientResource + + +@asset +def example_athena_asset(athena: AthenaClientResource): + return athena.get_client().execute_query("SELECT 1", fetch_results=True) + + +defs = Definitions( + assets=[example_athena_asset], resources={"athena": AthenaClientResource()} +) +``` + +### About AWS Athena + +AWS Athena is a serverless, interactive query service that allows you to analyze data directly in Amazon S3 using standard SQL. Athena is easy to use; point to your data in Amazon S3, define the schema, and start querying using standard SQL. Most results are delivered within seconds. With Athena, there are no infrastructure setups, and you pay only for the queries you run. It scales automatically—executing queries in parallel—so results are fast, even with large datasets and complex queries. 
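Building on the example above, query results can feed downstream processing. The sketch below is illustrative only: the table and columns are hypothetical, and it assumes `execute_query(..., fetch_results=True)` returns rows as a list of tuples (values may come back as strings).

```python
import pandas as pd
from dagster import asset
from dagster_aws.athena import AthenaClientResource


@asset
def daily_events_df(athena: AthenaClientResource) -> pd.DataFrame:
    # Hypothetical table and columns; rows are assumed to arrive as tuples.
    rows = athena.get_client().execute_query(
        "SELECT user_id, event_count FROM analytics.daily_events",
        fetch_results=True,
    )
    return pd.DataFrame(rows, columns=["user_id", "event_count"])
```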
diff --git a/docs/docs-beta/docs/integrations/aws-cloudwatch.md b/docs/docs-beta/docs/integrations/aws-cloudwatch.md new file mode 100644 index 0000000000000..0fd7e78cf877a --- /dev/null +++ b/docs/docs-beta/docs/integrations/aws-cloudwatch.md @@ -0,0 +1,63 @@ +--- +layout: Integration +status: published +name: AWS CloudWatch +title: Dagster & AWS CloudWatch +sidebar_label: AWS CloudWatch +excerpt: This integration allows you to send Dagster logs to AWS CloudWatch, enabling centralized logging and monitoring of your Dagster jobs. +date: 2024-06-21 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-aws +docslink: +partnerlink: https://aws.amazon.com/ +logo: /integrations/aws-cloudwatch.svg +categories: + - Monitoring +enabledBy: +enables: +--- + +### About this integration + +This integration allows you to send Dagster logs to AWS CloudWatch, enabling centralized logging and monitoring of your Dagster jobs. By using AWS CloudWatch, you can take advantage of its powerful log management features, such as real-time log monitoring, log retention policies, and alerting capabilities. + +Using this integration, you can configure your Dagster jobs to log directly to AWS CloudWatch, making it easier to track and debug your workflows. This is particularly useful for production environments where centralized logging is essential for maintaining observability and operational efficiency. + +### Installation + +```bash +pip install dagster-aws +``` + +### Examples + +```python +import dagster as dg +from dagster_aws.cloudwatch import cloudwatch_logger + + +@dg.asset +def my_asset(context: dg.AssetExecutionContext): + context.log.info("Hello, CloudWatch!") + context.log.error("This is an error") + context.log.debug("This is a debug message") + + +defs = dg.Definitions( + assets=[my_asset], + loggers={ + "cloudwatch_logger": cloudwatch_logger, + }, +) +``` + +### About AWS CloudWatch + +AWS CloudWatch is a monitoring and observability service provided by Amazon Web Services (AWS). It allows you to collect, access, and analyze performance and operational data from a variety of AWS resources, applications, and services. With AWS CloudWatch, you can set up alarms, visualize logs and metrics, and gain insights into your infrastructure and applications to ensure they're running smoothly. + +AWS CloudWatch provides features such as: + +- Real-time monitoring: Track the performance of your applications and infrastructure in real-time. +- Log management: Collect, store, and analyze log data from various sources. +- Alarms and notifications: Set up alarms to automatically notify you of potential issues. +- Dashboards: Create custom dashboards to visualize metrics and logs. +- Integration with other AWS services: Seamlessly integrate with other AWS services for a comprehensive monitoring solution. diff --git a/docs/docs-beta/docs/integrations/aws-ecr.md b/docs/docs-beta/docs/integrations/aws-ecr.md new file mode 100644 index 0000000000000..9b242d1d41fb7 --- /dev/null +++ b/docs/docs-beta/docs/integrations/aws-ecr.md @@ -0,0 +1,58 @@ +--- +layout: Integration +status: published +name: AWS ECR +title: Dagster & AWS ECR +sidebar_label: AWS ECR +excerpt: This integration allows you to connect to AWS Elastic Container Registry (ECR), enabling you to manage your container images more effectively in your Dagster pipelines. 
+date: 2024-06-21 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-aws +docslink: +partnerlink: https://aws.amazon.com/ +logo: /integrations/aws-ecr.svg +categories: + - Other +enabledBy: +enables: +--- + +### About this integration + +This integration allows you to connect to AWS Elastic Container Registry (ECR). It provides resources to interact with AWS ECR, enabling you to manage your container images. + +Using this integration, you can seamlessly integrate AWS ECR into your Dagster pipelines, making it easier to manage and deploy containerized applications. + +### Installation + +```bash +pip install dagster-aws +``` + +### Examples + +```python +from dagster import asset, Definitions +from dagster_aws.ecr import ECRPublicResource + + +@asset +def get_ecr_login_password(ecr_public: ECRPublicResource): + return ecr_public.get_client().get_login_password() + + +defs = Definitions( + assets=[get_ecr_login_password], + resources={ + "ecr_public": ECRPublicResource( + region_name="us-west-1", + aws_access_key_id="your_access_key_id", + aws_secret_access_key="your_secret_access_key", + aws_session_token="your_session_token", + ) + }, +) +``` + +### About AWS ECR + +AWS Elastic Container Registry (ECR) is a fully managed Docker container registry that makes it easy for developers to store, manage, and deploy Docker container images. AWS ECR is integrated with Amazon Elastic Kubernetes Service (EKS), simplifying your development-to-production workflow. With ECR, you can securely store and manage your container images and easily integrate with your existing CI/CD pipelines. AWS ECR provides high availability and scalability, ensuring that your container images are always available when you need them. diff --git a/docs/docs-beta/docs/integrations/aws-emr.md b/docs/docs-beta/docs/integrations/aws-emr.md new file mode 100644 index 0000000000000..e1b5ec0763010 --- /dev/null +++ b/docs/docs-beta/docs/integrations/aws-emr.md @@ -0,0 +1,96 @@ +--- +layout: Integration +status: published +name: AWS EMR +title: Dagster & AWS EMR +sidebar_label: AWS EMR +excerpt: The AWS EMR integration allows you to seamlessly integrate AWS EMR into your Dagster pipelines for petabyte-scale data processing using open source tools like Apache Spark, Hive, Presto, and more. +date: 2024-06-21 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-aws +docslink: +partnerlink: https://aws.amazon.com/ +logo: /integrations/aws-emr.svg +categories: + - Compute +enabledBy: +enables: +--- + +### About this integration + +The `dagster-aws` integration provides ways of orchestrating data pipelines that leverage AWS services, including AWS EMR (Elastic MapReduce). This integration allows you to run and scale big data workloads using open source tools such as Apache Spark, Hive, Presto, and more. + +Using this integration, you can: + +- Seamlessly integrate AWS EMR into your Dagster pipelines. +- Utilize EMR for petabyte-scale data processing. +- Easily manage and monitor EMR clusters and jobs from within Dagster. +- Leverage Dagster's orchestration capabilities to handle complex data workflows involving EMR.
+ +### Installation + +```bash +pip install dagster-aws +``` + +### Examples + +```python +from pathlib import Path +from typing import Any + +from dagster import Definitions, ResourceParam, asset +from dagster_aws.emr import emr_pyspark_step_launcher +from dagster_aws.s3 import S3Resource +from dagster_pyspark import PySparkResource +from pyspark.sql import DataFrame, Row +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + + +emr_pyspark = PySparkResource(spark_config={"spark.executor.memory": "2g"}) + + +@asset +def people( + pyspark: PySparkResource, pyspark_step_launcher: ResourceParam[Any] +) -> DataFrame: + schema = StructType( + [StructField("name", StringType()), StructField("age", IntegerType())] + ) + rows = [ + Row(name="Thom", age=51), + Row(name="Jonny", age=48), + Row(name="Nigel", age=49), + ] + return pyspark.spark_session.createDataFrame(rows, schema) + + +@asset +def people_over_50( + pyspark_step_launcher: ResourceParam[Any], people: DataFrame +) -> DataFrame: + return people.filter(people["age"] > 50) + + +defs = Definitions( + assets=[people, people_over_50], + resources={ + "pyspark_step_launcher": emr_pyspark_step_launcher.configured( + { + "cluster_id": {"env": "EMR_CLUSTER_ID"}, + "local_pipeline_package_path": str(Path(__file__).parent), + "deploy_local_pipeline_package": True, + "region_name": "us-west-1", + "staging_bucket": "my_staging_bucket", + "wait_for_logs": True, + } + ), + "pyspark": emr_pyspark, + "s3": S3Resource(), + }, +) +``` + +### About AWS EMR + +**AWS EMR** (Elastic MapReduce) is a cloud big data platform for processing vast amounts of data using open source tools such as Apache Spark, Apache Hive, Apache HBase, Apache Flink, Apache Hudi, and Presto. It simplifies running big data frameworks, allowing you to process and analyze large datasets quickly and cost-effectively. AWS EMR provides the scalability, flexibility, and reliability needed to handle complex data processing tasks, making it an ideal choice for data engineers and scientists. diff --git a/docs/docs-beta/docs/integrations/aws-glue.md b/docs/docs-beta/docs/integrations/aws-glue.md new file mode 100644 index 0000000000000..227dc28ca9bc7 --- /dev/null +++ b/docs/docs-beta/docs/integrations/aws-glue.md @@ -0,0 +1,71 @@ +--- +layout: Integration +status: published +name: AWS Glue +title: Dagster & AWS Glue +sidebar_label: AWS Glue +excerpt: The AWS Glue integration enables you to initiate AWS Glue jobs directly from Dagster, seamlessly pass parameters to your code, and stream logs and structured messages back into Dagster. +date: 2024-08-20 +apireflink: https://docs.dagster.io/concepts/dagster-pipes/aws-glue +docslink: +partnerlink: https://aws.amazon.com/ +logo: /integrations/aws-glue.svg +categories: + - Compute +enabledBy: +enables: +--- + +### About this integration + +The `dagster-aws` integration library provides the `PipesGlueClient` resource, enabling you to launch AWS Glue jobs directly from Dagster assets and ops. This integration allows you to pass parameters to Glue code while Dagster receives real-time events, such as logs, asset checks, and asset materializations, from the initiated jobs. With minimal code changes required on the job side, this integration is both efficient and easy to implement. 
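On the Glue side, the job script opens a Dagster Pipes session to receive context and stream messages back to Dagster. The following is a hedged sketch of that job-side script: it assumes `dagster-pipes` is available in the job environment and exposes `PipesCliArgsParamsLoader`, `PipesS3ContextLoader`, and `PipesS3MessageWriter`, and it pairs with the S3-based configuration shown in the Examples section below; the metadata values are illustrative.

```python
# Glue job script (illustrative sketch), launched by the PipesGlueClient shown below.
import boto3
from dagster_pipes import (
    PipesCliArgsParamsLoader,
    PipesS3ContextLoader,
    PipesS3MessageWriter,
    open_dagster_pipes,
)

s3_client = boto3.client("s3")

# Glue passes the Pipes bootstrap payload as job arguments, so a CLI-args
# params loader is assumed here; context and messages travel through S3.
with open_dagster_pipes(
    params_loader=PipesCliArgsParamsLoader(),
    context_loader=PipesS3ContextLoader(client=s3_client),
    message_writer=PipesS3MessageWriter(client=s3_client),
) as pipes:
    pipes.log.info("Hello from the Glue job")
    # ... the job's actual work goes here ...
    pipes.report_asset_materialization(
        metadata={"rows_processed": {"raw_value": 42, "type": "int"}}
    )
```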
+ +### Installation + +```bash +pip install dagster-aws +``` + +### Examples + +```python +import boto3 +from dagster import AssetExecutionContext, Definitions, asset +from dagster_aws.pipes import ( + PipesGlueClient, + PipesS3ContextInjector, + PipesS3MessageReader, +) + + +@asset +def glue_pipes_asset( + context: AssetExecutionContext, pipes_glue_client: PipesGlueClient +): + return pipes_glue_client.run( + context=context, + job_name="Example Job", + arguments={"some_parameter_value": "1"}, + ).get_materialize_result() + + +defs = Definitions( + assets=[glue_pipes_asset], + resources={ + "pipes_glue_client": PipesGlueClient( + client=boto3.client("glue"), + context_injector=PipesS3ContextInjector( + client=boto3.client("s3"), + bucket="my-bucket", + ), + message_reader=PipesS3MessageReader( + client=boto3.client("s3"), bucket="my-bucket" + ), + ) + }, +) +``` + +### About AWS Glue + +**AWS Glue** is a fully managed cloud service designed to simplify and automate the process of discovering, preparing, and integrating data for analytics, machine learning, and application development. It supports a wide range of data sources and formats, offering seamless integration with other AWS services. AWS Glue provides the tools to create, run, and manage ETL (Extract, Transform, Load) jobs, making it easier to handle complex data workflows. Its serverless architecture allows for scalability and flexibility, making it a preferred choice for data engineers and analysts who need to process and prepare data efficiently. diff --git a/docs/docs-beta/docs/integrations/aws-lambda.md b/docs/docs-beta/docs/integrations/aws-lambda.md new file mode 100644 index 0000000000000..70b1d558420dd --- /dev/null +++ b/docs/docs-beta/docs/integrations/aws-lambda.md @@ -0,0 +1,62 @@ +--- +layout: Integration +status: published +name: AWS Lambda +title: Dagster & AWS Lambda +sidebar_label: AWS Lambda +excerpt: Using the AWS Lambda integration with Dagster, you can leverage serverless functions to execute external code in your pipelines. +date: 2024-06-21 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-aws +docslink: +partnerlink: https://aws.amazon.com/ +logo: /integrations/aws-lambda.svg +categories: + - Compute +enabledBy: +enables: +--- + +### About this integration + +Using this integration, you can leverage AWS Lambda to execute external code as part of your Dagster pipelines. This is particularly useful for running serverless functions that can scale automatically and handle various workloads without the need for managing infrastructure. The `PipesLambdaClient` class allows you to invoke AWS Lambda functions and stream logs and structured metadata back to Dagster's UI and tools. 
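Inside the Lambda function itself, a Dagster Pipes session can be opened from the handler to stream logs and report materializations back to Dagster. The following is a hedged sketch: it assumes `dagster-pipes` is packaged with the function and that the bootstrap payload arrives in the invocation event, as sent by the `PipesLambdaClient` in the Examples section below; the metadata values are illustrative.

```python
# Lambda handler (illustrative sketch), invoked by the PipesLambdaClient shown below.
from dagster_pipes import PipesMappingParamsLoader, open_dagster_pipes


def lambda_handler(event, _context):
    # The invocation event carries the Pipes bootstrap payload, so the event
    # mapping doubles as the params source.
    with open_dagster_pipes(params_loader=PipesMappingParamsLoader(event)) as pipes:
        pipes.log.info(f"Invoked with event keys: {sorted(event)}")
        # ... your function's work goes here ...
        pipes.report_asset_materialization(
            metadata={"records_written": {"raw_value": 10, "type": "int"}}
        )
```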
+ +### Installation + +```bash +pip install dagster-aws +``` + +### Examples + +```python +import boto3 + +from dagster import asset, AssetExecutionContext, Definitions +from dagster_aws.pipes import PipesLambdaClient + + +lambda_client = boto3.client("lambda", region_name="us-west-1") + +lambda_pipes_client = PipesLambdaClient(client=lambda_client) + + +@asset +def lambda_pipes_asset( + context: AssetExecutionContext, lambda_pipes_client: PipesLambdaClient +): + return lambda_pipes_client.run( + context=context, + function_name="your_lambda_function_name", + event={"key": "value"}, + ).get_materialize_result() + + +defs = Definitions( + assets=[lambda_pipes_asset], + resources={"lambda_pipes_client": lambda_pipes_client}, +) +``` + +### About AWS Lambda + +**AWS Lambda** is a serverless compute service provided by Amazon Web Services (AWS). It allows you to run code without provisioning or managing servers. AWS Lambda automatically scales your application by running code in response to each trigger, such as changes to data in an Amazon S3 bucket or an update to a DynamoDB table. You can use AWS Lambda to extend other AWS services with custom logic, or create your own backend services that operate at AWS scale, performance, and security. diff --git a/docs/docs-beta/docs/integrations/aws-redshift.md b/docs/docs-beta/docs/integrations/aws-redshift.md new file mode 100644 index 0000000000000..9cb47a5270cc0 --- /dev/null +++ b/docs/docs-beta/docs/integrations/aws-redshift.md @@ -0,0 +1,58 @@ +--- +layout: Integration +status: published +name: AWS Redshift +title: Dagster & AWS Redshift +sidebar_label: AWS Redshift +excerpt: "Using this integration, you can seamlessly integrate AWS Redshift into your Dagster workflows, leveraging Redshifts data warehousing capabilities for your data pipelines." +date: 2024-06-21 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-aws +docslink: +partnerlink: https://aws.amazon.com/ +logo: /integrations/aws-redshift.svg +categories: + - Storage +enabledBy: +enables: +--- + +### About this integration + +Using this integration, you can connect to an AWS Redshift cluster and issue queries against it directly from your Dagster assets. This allows you to seamlessly integrate Redshift into your data pipelines, leveraging the power of Redshift's data warehousing capabilities within your Dagster workflows. + +### Installation + +```bash +pip install dagster-aws +``` + +### Examples + +```python +from dagster import Definitions, asset, EnvVar +from dagster_aws.redshift import RedshiftClientResource + + +@asset +def example_redshift_asset(context, redshift: RedshiftClientResource): + result = redshift.get_client().execute_query("SELECT 1", fetch_results=True) + context.log.info(f"Query result: {result}") + + +redshift_configured = RedshiftClientResource( + host="my-redshift-cluster.us-east-1.redshift.amazonaws.com", + port=5439, + user="dagster", + password=EnvVar("DAGSTER_REDSHIFT_PASSWORD"), + database="dev", +) + +defs = Definitions( + assets=[example_redshift_asset], + resources={"redshift": redshift_configured}, +) +``` + +### About AWS Redshift + +**AWS Redshift** is a fully managed, petabyte-scale data warehouse service in the cloud. You can start with just a few hundred gigabytes of data and scale to a petabyte or more. This enables you to use your data to acquire new insights for your business and customers. 
Redshift offers fast query performance using SQL-based tools and business intelligence applications, making it a powerful tool for data warehousing and analytics. diff --git a/docs/docs-beta/docs/integrations/aws-s3.md b/docs/docs-beta/docs/integrations/aws-s3.md new file mode 100644 index 0000000000000..87138d584e462 --- /dev/null +++ b/docs/docs-beta/docs/integrations/aws-s3.md @@ -0,0 +1,66 @@ +--- +layout: Integration +status: published +name: AWS S3 +title: Dagster & AWS S3 +sidebar_label: AWS S3 +excerpt: The AWS S3 integration allows data engineers to easily read and write objects to the durable AWS S3 storage, enabling engineers to have a resilient storage layer when constructing their pipelines. +date: 2024-06-21 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-aws +docslink: +partnerlink: https://aws.amazon.com/ +logo: /integrations/aws-s3.svg +categories: + - Storage +enabledBy: +enables: +--- + +### About this integration + +The AWS S3 integration allows data engineers to easily read and write objects to durable AWS S3 storage, giving engineers a resilient storage layer when constructing their pipelines. + +### Installation + +```bash +pip install dagster-aws +``` + +### Examples + +Here is an example of how to use the `S3Resource` in a Dagster asset to interact with AWS S3: + +```python +import pandas as pd +from dagster import Definitions, asset +from dagster_aws.s3 import S3Resource + + +@asset +def my_s3_asset(s3: S3Resource): + df = pd.DataFrame({"column1": [1, 2, 3], "column2": ["A", "B", "C"]}) + + csv_data = df.to_csv(index=False) + + s3_client = s3.get_client() + + s3_client.put_object( + Bucket="my-cool-bucket", + Key="path/to/my_dataframe.csv", + Body=csv_data, + ) + + +defs = Definitions( + assets=[my_s3_asset], + resources={"s3": S3Resource(region_name="us-west-2")}, +) +``` + +### About AWS S3 + +**Amazon Simple Storage Service (Amazon S3)** is an object storage service that offers industry-leading scalability, data availability, security, and performance. This means customers of all sizes and industries can use it to store and protect any amount of data for a range of use cases, such as data lakes, websites, mobile applications, backup and restore, archive, enterprise applications, IoT devices, and big data analytics. Amazon S3 provides easy-to-use management features so you can organize your data and configure finely-tuned access controls to meet your specific business, organizational, and compliance requirements. diff --git a/docs/docs-beta/docs/integrations/aws-secretsmanager.md b/docs/docs-beta/docs/integrations/aws-secretsmanager.md new file mode 100644 index 0000000000000..dbe7d20c221a3 --- /dev/null +++ b/docs/docs-beta/docs/integrations/aws-secretsmanager.md @@ -0,0 +1,67 @@ +--- +layout: Integration +status: published +name: AWS Secrets Manager +title: Dagster & AWS Secrets Manager +sidebar_label: AWS Secrets Manager +excerpt: This integration allows you to manage, retrieve, and rotate credentials, API keys, and other secrets using AWS Secrets Manager. +date: 2024-06-21 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-aws +docslink: +partnerlink: https://aws.amazon.com/ +logo: /integrations/aws-secretsmanager.svg +categories: + - Other +enabledBy: +enables: +--- + +### About this integration + +This integration allows you to manage, retrieve, and rotate credentials, API keys, and other secrets using [AWS Secrets Manager](https://aws.amazon.com/secrets-manager/).
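A fetched secret usually feeds other configuration rather than being returned directly. The sketch below is illustrative only: the secret name and its JSON structure are hypothetical, and it relies on the same `fetch_secrets()` call and resource setup shown in the Examples section below.

```python
import json

from dagster import asset
from dagster_aws.secretsmanager import SecretsManagerSecretsResource


@asset
def warehouse_connection_string(secrets: SecretsManagerSecretsResource) -> str:
    # fetch_secrets() returns a mapping of secret names to secret values;
    # "analytics-db-credentials" and its JSON fields are hypothetical.
    raw_value = secrets.fetch_secrets()["analytics-db-credentials"]
    creds = json.loads(raw_value)
    return f"postgresql://{creds['username']}@{creds['host']}:5432/analytics"
```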
+ +### Installation + +```bash +pip install dagster-aws +``` + +### Examples + +```python +from dagster import asset, Definitions +from dagster_aws.secretsmanager import ( + SecretsManagerResource, + SecretsManagerSecretsResource, +) + + +@asset +def my_asset(secretsmanager: SecretsManagerResource): + secret_value = secretsmanager.get_client().get_secret_value( + SecretId="arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf" + ) + return secret_value + + +@asset +def my_other_asset(secrets: SecretsManagerSecretsResource): + secret_value = secrets.fetch_secrets().get("my-secret-name") + return secret_value + + +defs = Definitions( + assets=[my_asset, my_other_asset], + resources={ + "secretsmanager": SecretsManagerResource(region_name="us-west-1"), + "secrets": SecretsManagerSecretsResource( + region_name="us-west-1", + secrets_tag="dagster", + ), + }, +) +``` + +### About AWS Secrets Manager + +**AWS Secrets Manager** helps you protect access to your applications, services, and IT resources without the upfront cost and complexity of managing your own hardware security module infrastructure. With Secrets Manager, you can rotate, manage, and retrieve database credentials, API keys, and other secrets throughout their lifecycle. Users and applications retrieve secrets with a call to Secrets Manager APIs, eliminating the need to hardcode sensitive information in plain text. diff --git a/docs/docs-beta/docs/integrations/aws-ssm.md b/docs/docs-beta/docs/integrations/aws-ssm.md new file mode 100644 index 0000000000000..243dbe6c6b6b7 --- /dev/null +++ b/docs/docs-beta/docs/integrations/aws-ssm.md @@ -0,0 +1,70 @@ +--- +layout: Integration +status: published +name: AWS Systems Parameter Store +title: Dagster & AWS Systems Parameter Store +sidebar_label: AWS Systems Parameter Store +excerpt: The Dagster AWS Systems Manager (SSM) Parameter Store integration allows you to manage and retrieve parameters stored in AWS SSM Parameter Store directly within your Dagster pipelines. +date: 2024-06-21 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-aws +docslink: +partnerlink: https://aws.amazon.com/ +logo: /integrations/aws-ssm.svg +categories: + - Other +enabledBy: +enables: +--- + +### About this integration + +The Dagster AWS Systems Manager (SSM) Parameter Store integration allows you to manage and retrieve parameters stored in AWS SSM Parameter Store directly within your Dagster pipelines. This integration provides resources to fetch parameters by name, tags, or paths, and optionally set them as environment variables for your operations. 
+ +### Installation + +```bash +pip install dagster-aws +``` + +### Examples + +```python +from dagster import asset, Definitions +from dagster_aws.ssm import ParameterStoreResource, ParameterStoreTag + + +@asset +def example_parameter_store_asset(parameter_store: ParameterStoreResource): + parameter_value = parameter_store.fetch_parameters( + parameters=["my-parameter-name"] + ).get("my-parameter-name") + return parameter_value + + +@asset +def example_parameter_store_asset_with_env(parameter_store: ParameterStoreResource): + import os + + with parameter_store.parameters_in_environment(): + return os.getenv("my-other-parameter-name") + + +defs = Definitions( + assets=[example_parameter_store_asset, example_parameter_store_asset_with_env], + resources={ + "parameter_store": ParameterStoreResource( + region_name="us-west-1", + parameter_tags=[ + ParameterStoreTag(key="my-tag-key", values=["my-tag-value"]) + ], + with_decryption=True, + ) + }, +) +``` + +### About AWS Systems Parameter Store + +**AWS Systems Manager Parameter Store** is a secure storage service for configuration data management and secrets management. It allows you to store data such as passwords, database strings, and license codes as parameter values. You can then reference these parameters in your applications or scripts, ensuring that sensitive information isn't hard-coded or exposed in your codebase. + +AWS Systems Manager Parameter Store integrates with AWS Identity and Access Management (IAM) to control access to parameters, and it supports encryption using AWS Key Management Service (KMS) to protect sensitive data. This service is essential for maintaining secure and manageable configurations across your AWS environment. diff --git a/docs/docs-beta/docs/integrations/azure-adls2.md b/docs/docs-beta/docs/integrations/azure-adls2.md new file mode 100644 index 0000000000000..6bedf416e192a --- /dev/null +++ b/docs/docs-beta/docs/integrations/azure-adls2.md @@ -0,0 +1,64 @@ +--- +layout: Integration +status: published +name: Azure Data Lake Storage Gen 2 (ADLS2) +title: Dagster & Azure Data Lake Storage Gen 2 (ADLS2) +sidebar_label: Azure Data Lake Storage Gen 2 (ADLS2) +excerpt: Get utilities for ADLS2 and Blob Storage. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-azure +docslink: +partnerlink: https://azure.microsoft.com/ +logo: /integrations/Azure.svg +categories: + - Storage +enabledBy: +enables: +--- + +### About this integration + +Dagster helps you use Azure Storage Accounts as part of your data pipeline. Azure Data Lake Storage Gen 2 (ADLS2) is our primary focus but we also provide utilities for Azure Blob Storage. + +### Installation + +```bash +pip install dagster-azure +``` + +### Examples + +```python +import pandas as pd +from dagster import Definitions, asset, job +from dagster_azure.adls2 import ADLS2Resource, ADLS2SASToken + + +@asset +def example_adls2_asset(adls2: ADLS2Resource): + df = pd.DataFrame({"column1": [1, 2, 3], "column2": ["A", "B", "C"]}) + + csv_data = df.to_csv(index=False) + + file_client = adls2.adls2_client.get_file_client( + "my-file-system", "path/to/my_dataframe.csv" + ) + file_client.upload_data(csv_data, overwrite=True) + + +defs = Definitions( + assets=[example_adls2_asset], + resources={ + "adls2": ADLS2Resource( + storage_account="my_storage_account", + credential=ADLS2SASToken(token="my_sas_token"), + ) + }, +) +``` + +In this updated code, we use `ADLS2Resource` directly instead of `adls2_resource`. 
The configuration is passed to `ADLS2Resource` during its instantiation. + +### About Azure Data Lake Storage Gen 2 (ADLS2) + +**Azure Data Lake Storage Gen 2 (ADLS2)** is a set of capabilities dedicated to big data analytics, built on Azure Blob Storage. ADLS2 combines the scalability, cost-effectiveness, security, and rich capabilities of Azure Blob Storage with a high-performance file system that's built for analytics and is compatible with the Hadoop Distributed File System (HDFS). This makes it an ideal choice for data lakes and big data analytics. diff --git a/docs/docs-beta/docs/integrations/census.md b/docs/docs-beta/docs/integrations/census.md new file mode 100644 index 0000000000000..cb8b44d0cb6a5 --- /dev/null +++ b/docs/docs-beta/docs/integrations/census.md @@ -0,0 +1,49 @@ +--- +layout: Integration +status: published +name: Census +title: Dagster & Census +sidebar_label: Census +excerpt: Trigger Census syncs from within your Dagster pipelines. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-census +partnerlink: https://www.getcensus.com/ +communityIntegration: true +logo: /integrations/Census.svg +categories: + - ETL +enabledBy: +enables: +--- + +### About this integration + +With the `dagster-census` integration you can execute a Census sync and poll until that sync completes, raising an error if it's unsuccessful. + +### Installation + +```bash +pip install dagster-census +``` + +### Example + +```python +import dagster as dg +from dagster_census import CensusResource + + +@dg.asset +def census_source(census: CensusResource): + census.get_source(source_id=1) + + +defs = dg.Definitions( + assets=[census_source], + resources={"census": CensusResource(api_key=dg.EnvVar("CENSUS_API_KEY"))}, +) +``` + +### About Census + +**Census** syncs data from your cloud warehouse to the SaaS tools your organization uses. It allows everyone in your organization to take action with good data, no custom scripts or API integrations required. diff --git a/docs/docs-beta/docs/integrations/cube.md b/docs/docs-beta/docs/integrations/cube.md new file mode 100644 index 0000000000000..03abcaac72fda --- /dev/null +++ b/docs/docs-beta/docs/integrations/cube.md @@ -0,0 +1,61 @@ +--- +layout: Integration +status: published +name: Cube +title: Dagster & Cube +sidebar_label: Cube +excerpt: "Push changes from upstream data sources to Cube's semantic layer." +date: 2023-08-30 +apireflink: https://cube.dev/docs/orchestration-api/dagster +partnerlink: https://cube.dev/ +communityIntegration: true +logo: /integrations/cube.svg +categories: + - Other +enabledBy: +enables: +--- + +### About this integration + +With the `dagster_cube` integration you can set up Cube and Dagster to work together so that Dagster can push changes from upstream data sources to Cube using its integration API. + +### Installation + +```bash +pip install dagster_cube +``` + +### Example + +```python +import dagster as dg +from dagster_cube import CubeResource + + +@dg.asset +def cube_query_workflow(cube: CubeResource): + response = cube.make_request( + method="POST", + endpoint="load", + data={"query": {"measures": ["Orders.count"], "dimensions": ["Orders.status"]}}, + ) + + return response + + +defs = dg.Definitions( + assets=[cube_query_workflow], + resources={ + "cube": CubeResource( + instance_url="https://<>.cubecloudapp.dev/cubejs-api/v1/", + api_key=dg.EnvVar("CUBE_API_KEY"), + ) + }, +) + +``` + +### About Cube + +**Cube.js** is the semantic layer for building data applications.
It helps data engineers and application developers access data from modern data stores, organize it into consistent definitions, and deliver it to every application. diff --git a/docs/docs-beta/docs/integrations/databricks.md b/docs/docs-beta/docs/integrations/databricks.md new file mode 100644 index 0000000000000..e3abb26b20995 --- /dev/null +++ b/docs/docs-beta/docs/integrations/databricks.md @@ -0,0 +1,138 @@ +--- +layout: Integration +status: published +name: Databricks +title: Dagster & Databricks +sidebar_label: Databricks +excerpt: The Databricks integration enables you to initiate Databricks jobs directly from Dagster, seamlessly pass parameters to your code, and stream logs and structured messages back into Dagster. +date: 2024-08-20 +apireflink: https://docs.dagster.io/concepts/dagster-pipes/databricks +docslink: +partnerlink: https://databricks.com/ +logo: /integrations/databricks.svg +categories: + - Compute +enabledBy: +enables: +--- + +### About this integration + +The `dagster-databricks` integration library provides the `PipesDatabricksClient` resource, enabling you to launch Databricks jobs directly from Dagster assets and ops. This integration allows you to pass parameters to Databricks code while Dagster receives real-time events, such as logs, asset checks, and asset materializations, from the initiated jobs. With minimal code changes required on the job side, this integration is both efficient and easy to implement. + +### Installation + +```bash +pip install dagster-databricks +``` + +### Example + +#### Dagster code + +```python +import os +import sys + +from dagster_databricks import PipesDatabricksClient + +from dagster import AssetExecutionContext, Definitions, EnvVar, asset +from databricks.sdk import WorkspaceClient +from databricks.sdk.service import jobs + + +@asset +def databricks_asset( + context: AssetExecutionContext, pipes_databricks: PipesDatabricksClient +): + task = jobs.SubmitTask.from_dict( + { + # The cluster settings below are somewhat arbitrary. Dagster Pipes is + # not dependent on a specific spark version, node type, or number of + # workers. + "new_cluster": { + "spark_version": "12.2.x-scala2.12", + "node_type_id": "i3.xlarge", + "num_workers": 0, + "cluster_log_conf": { + "dbfs": {"destination": "dbfs:/cluster-logs-dir-noexist"}, + }, + }, + "libraries": [ + # Include the latest published version of dagster-pipes on PyPI + # in the task environment + {"pypi": {"package": "dagster-pipes"}}, + ], + "task_key": "some-key", + "spark_python_task": { + "python_file": "dbfs:/my_python_script.py", # location of target code file + "source": jobs.Source.WORKSPACE, + }, + } + ) + + print("This will be forwarded back to Dagster stdout") + print("This will be forwarded back to Dagster stderr", file=sys.stderr) + + extras = {"some_parameter": 100} + + return pipes_databricks.run( + task=task, + context=context, + extras=extras, + ).get_materialize_result() + + +pipes_databricks_resource = PipesDatabricksClient( + client=WorkspaceClient( + host=os.environ["DATABRICKS_HOST"], + token=os.environ["DATABRICKS_TOKEN"], + ) +) + +defs = Definitions( + assets=[databricks_asset], resources={"pipes_databricks": pipes_databricks_resource} +) +``` + +#### Databricks code + +```python +from dagster_pipes import ( + PipesDbfsContextLoader, + PipesDbfsMessageWriter, + open_dagster_pipes, +) + +# Sets up communication channels and downloads the context data sent from Dagster. 
+# Note that while other `context_loader` and `message_writer` settings are +# possible, it is recommended to use `PipesDbfsContextLoader` and +# `PipesDbfsMessageWriter` for Databricks. +with open_dagster_pipes( + context_loader=PipesDbfsContextLoader(), + message_writer=PipesDbfsMessageWriter(), +) as pipes: + # Access the `extras` dict passed when launching the job from Dagster. + some_parameter_value = pipes.get_extra("some_parameter") + + # Stream log message back to Dagster + pipes.log.info(f"Using some_parameter value: {some_parameter_value}") + + # ... your code that computes and persists the asset + + # Stream asset materialization metadata and data version back to Dagster. + # This should be called after you've computed and stored the asset value. We + # omit the asset key here because there is only one asset in scope, but for + # multi-assets you can pass an `asset_key` parameter. + pipes.report_asset_materialization( + metadata={ + "some_metric": {"raw_value": some_parameter_value + 1, "type": "int"} + }, + data_version="alpha", + ) + +``` + +### About Databricks + +**Databricks** is a unified data analytics platform that simplifies and accelerates the process of building big data and AI solutions. It integrates seamlessly with Apache Spark and offers support for various data sources and formats. Databricks provides powerful tools to create, run, and manage data pipelines, making it easier to handle complex data engineering tasks. Its collaborative and scalable environment is ideal for data engineers, scientists, and analysts who need to process and analyze large datasets efficiently. diff --git a/docs/docs-beta/docs/integrations/datadog.md b/docs/docs-beta/docs/integrations/datadog.md new file mode 100644 index 0000000000000..a1a00cf2764b0 --- /dev/null +++ b/docs/docs-beta/docs/integrations/datadog.md @@ -0,0 +1,60 @@ +--- +layout: Integration +status: published +name: Datadog +title: Dagster & Datadog +sidebar_label: Datadog +excerpt: Publish metrics to Datadog from within Dagster ops and centralize your monitoring metrics. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-datadog +docslink: +partnerlink: https://www.datadoghq.com/ +logo: /integrations/Datadog.svg +categories: + - Monitoring +enabledBy: +enables: +--- + +### About this integration + +While Dagster provides comprehensive monitoring and observability of the pipelines it orchestrates, many teams look to centralize all their monitoring across apps, processes, and infrastructure using Datadog's 'Cloud Monitoring as a Service'. The `dagster-datadog` integration allows you to publish metrics to Datadog from within Dagster ops. + +### Installation + +```bash +pip install dagster-datadog +``` + +### Example + +```python +import os + +import dagster as dg +from dagster_datadog import DatadogResource + + +@dg.asset +def report_to_datadog(datadog: DatadogResource): + datadog_client = datadog.get_client() + datadog_client.event("Man down!", "This server needs assistance.") + datadog_client.gauge("users.online", 1001, tags=["protocol:http"]) + datadog_client.increment("page.views") + + +defs = dg.Definitions( + assets=[report_to_datadog], + resources={ + "datadog": DatadogResource( + api_key=os.environ["DATADOG_API_KEY"], + app_key=os.environ["DATADOG_APP_KEY"], + ) + }, +) + +``` + +### About Datadog + +**Datadog** is an observability service for cloud-scale applications, providing monitoring of servers, databases, tools, and services through a SaaS-based data analytics platform.
diff --git a/docs/docs-beta/docs/integrations/dbt-cloud.md b/docs/docs-beta/docs/integrations/dbt-cloud.md new file mode 100644 index 0000000000000..4a3c7abac5eec --- /dev/null +++ b/docs/docs-beta/docs/integrations/dbt-cloud.md @@ -0,0 +1,51 @@ +--- +layout: Integration +status: published +name: dbt Cloud +title: Dagster & dbt Cloud +sidebar_label: dbt Cloud +excerpt: Run dbt Cloud™ jobs as part of your data pipeline. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-dbt#assets-dbt-cloud +docslink: https://docs.dagster.io/integrations/dbt_cloud +partnerlink: +logo: /integrations/dbt.svg +categories: + - ETL +enabledBy: +enables: +--- + +### About this integration + +Dagster allows you to run dbt Cloud jobs alongside other technologies. You can schedule them to run as a step in a larger pipeline and manage them as a data asset. + +### Installation + +```bash +pip install dagster-dbt +``` + +### Example + +```python +from dagster_dbt import dbt_cloud_resource, load_assets_from_dbt_cloud_job +import os + +# configure a resource to connect to your dbt Cloud instance +dbt_cloud = dbt_cloud_resource.configured( + {"auth_token": os.environ["DBT_CLOUD_AUTH_TOKEN"], "account_id": 11111} +) + +# import assets from dbt +dbt_cloud_assets = load_assets_from_dbt_cloud_job( + dbt_cloud=dbt_cloud, + job_id=33333, +) +``` + +### About dbt Cloud + +**dbt Cloud** is a hosted service for running dbt jobs. It helps data analysts and engineers productionize dbt deployments. Beyond dbt open source, dbt Cloud provides scheduling, CI/CD, serving documentation, and monitoring & alerting. + +If you're currently using dbt Cloud™, you can also use Dagster to run `dbt-core` in its place. You can read more about [how to do that here](https://dagster.io/blog/migrate-off-dbt-cloud). diff --git a/docs/docs-beta/docs/integrations/dbt.md b/docs/docs-beta/docs/integrations/dbt.md new file mode 100644 index 0000000000000..18e742a923358 --- /dev/null +++ b/docs/docs-beta/docs/integrations/dbt.md @@ -0,0 +1,87 @@ +--- +layout: Integration +status: published +name: dbt +title: Dagster & dbt +sidebar_label: dbt +excerpt: Put your dbt transformations to work, directly from within Dagster. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-dbt +docslink: https://docs.dagster.io/integrations/dbt +partnerlink: https://www.getdbt.com/ +logo: /integrations/dbt.svg +categories: + - ETL +enabledBy: +enables: +--- + +### About this integration + +Dagster orchestrates dbt alongside other technologies, so you can schedule dbt with Spark, Python, etc. in a single data pipeline. + +Dagster assets understand dbt at the level of individual dbt models. This means that you can: + +- Use Dagster's UI or APIs to run subsets of your dbt models, seeds, and snapshots. +- Track failures, logs, and run history for individual dbt models, seeds, and snapshots.
+ +### Installation + +```bash +pip install dagster-dbt +``` + +### Example + +```python +from pathlib import Path + +from dagster import AssetExecutionContext, Definitions +from dagster_dbt import ( + DbtCliResource, + DbtProject, + build_schedule_from_dbt_selection, + dbt_assets, +) + +RELATIVE_PATH_TO_MY_DBT_PROJECT = "./my_dbt_project" + +my_project = DbtProject( + project_dir=Path(__file__) + .joinpath("..", RELATIVE_PATH_TO_MY_DBT_PROJECT) + .resolve(), +) +my_project.prepare_if_dev() + + +@dbt_assets(manifest=my_project.manifest_path) +def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource): + yield from dbt.cli(["build"], context=context).stream() + + +my_schedule = build_schedule_from_dbt_selection( + [my_dbt_assets], + job_name="materialize_dbt_models", + cron_schedule="0 0 * * *", + dbt_select="fqn:*", +) + +defs = Definitions( + assets=[my_dbt_assets], + schedules=[my_schedule], + resources={ + "dbt": DbtCliResource(project_dir=my_project), + }, +) +``` + +### About dbt + +**dbt** is a SQL-first transformation workflow that lets teams quickly and collaboratively deploy analytics code following software engineering best practices like modularity, portability, CI/CD, and documentation. + + diff --git a/docs/docs-beta/docs/integrations/deltalake.md b/docs/docs-beta/docs/integrations/deltalake.md new file mode 100644 index 0000000000000..b3133033aad12 --- /dev/null +++ b/docs/docs-beta/docs/integrations/deltalake.md @@ -0,0 +1,43 @@ +--- +layout: Integration +status: published +name: Delta Lake +title: Dagster & Delta Lake +sidebar_label: Delta Lake +excerpt: Integrate your pipelines into Delta Lake. +date: 2022-11-07 +communityIntegration: true +apireflink: https://delta-io.github.io/delta-rs/integrations/delta-lake-dagster/ +docslink: +partnerlink: https://delta.io/ +logo: /integrations/DeltaLake.svg +categories: + - Storage +enabledBy: +enables: +--- + +### About this integration + +Delta Lake is a great storage format for Dagster workflows. With this integration, you can use the Delta Lake I/O Manager to read and write your Dagster assets. + +Here are some of the benefits that Delta Lake provides Dagster users: + +- Native PyArrow integration for lazy computation of large datasets +- More efficient querying with file skipping with Z Ordering and liquid clustering +- Built-in vacuuming to remove unnecessary files and versions +- ACID transactions for reliable writes +- Smooth versioning integration (versions can be use to trigger downstream updates). +- Surfacing table stats based on the file statistics + +### Installation + +```bash +pip install dagster-deltalake +pip install dagster-deltalake-pandas +pip install dagster-deltalake-polars +``` + +### About Delta Lake + +Delta Lake is an open source storage framework that enables building a Lakehouse architecture with compute engines including Spark, PrestoDB, Flink, Trino, and Hive and APIs for Scala, Java, Rust, and Python. diff --git a/docs/docs-beta/docs/integrations/dlt.md b/docs/docs-beta/docs/integrations/dlt.md new file mode 100644 index 0000000000000..f38dd9071da01 --- /dev/null +++ b/docs/docs-beta/docs/integrations/dlt.md @@ -0,0 +1,66 @@ +--- +layout: Integration +status: published +name: dlt +title: Dagster & dlt +sidebar_label: dlt +excerpt: Easily ingest and replicate data between systems with dlt through Dagster. 
+date: 2024-08-30 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-embedded-elt +docslink: https://docs.dagster.io/integrations/embedded-elt/dlt +partnerlink: https://dlthub.com/ +logo: /integrations/dlthub.jpeg +categories: + - ETL +enabledBy: +enables: +--- + +### About this integration + +This integration allows you to use [dlt](https://dlthub.com/) to easily ingest and replicate data between systems through Dagster. + +### Installation + +```bash +pip install dagster-embedded-elt +``` + +### Example + +```python +import dagster as dg +from dagster_embedded_elt.dlt import DagsterDltResource, dlt_assets +from dlt import pipeline +from dlt_sources.github import github_reactions + + +@dlt_assets( + dlt_source=github_reactions("dagster-io", "dagster"), + dlt_pipeline=pipeline( + pipeline_name="github_issues", + dataset_name="github", + destination="snowflake", + ), + name="github", + group_name="github", +) +def github_issues_to_snowflake_assets( + context: dg.AssetExecutionContext, dlt: DagsterDltResource +): + yield from dlt.run(context=context) + + +defs = dg.Definitions( + assets=[ + github_issues_to_snowflake_assets, + ], + resources={ + "dlt": DagsterDltResource(), + }, +) +``` + +### About dlt + +[Data Load Tool (dlt)](https://dlthub.com/) is an open source library for creating efficient data pipelines. It offers features like secret management, data structure conversion, incremental updates, and pre-built sources and destinations, simplifying the process of loading messy data into well-structured datasets. diff --git a/docs/docs-beta/docs/integrations/docker.md b/docs/docs-beta/docs/integrations/docker.md new file mode 100644 index 0000000000000..31974e231a089 --- /dev/null +++ b/docs/docs-beta/docs/integrations/docker.md @@ -0,0 +1,67 @@ +--- +layout: Integration +status: published +name: Docker +title: Dagster & Docker +sidebar_label: Docker +excerpt: Run external processes in Docker containers directly from Dagster. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-docker +docslink: +partnerlink: https://www.docker.com/ +logo: /integrations/Docker.svg +categories: + - Compute +enabledBy: +enables: +--- + +### About this integration + +The `dagster-docker` integration library provides the `PipesDockerClient` resource, enabling you to launch Docker containers and execute external code directly from Dagster assets and ops. This integration allows you to pass parameters to Docker containers while Dagster receives real-time events, such as logs, asset checks, and asset materializations, from the initiated jobs. With minimal code changes required on the job side, this integration is both efficient and easy to implement. + +### Installation + +```bash +pip install dagster-docker +``` + +### Example + +```python +import dagster as dg +from dagster_docker import PipesDockerClient + + +@dg.asset +def docker_pipes_asset( + context: dg.AssetExecutionContext, docker_pipes_client: PipesDockerClient +): + docker_image = "python:3.9-slim" + return docker_pipes_client.run( + image=docker_image, + command=[ + "python", + "-m", + "my_module", + ], + context=context, + ).get_results() + + +defs = dg.Definitions( + assets=[docker_pipes_asset], + resources={ + "docker_pipes_client": PipesDockerClient(), + }, +) +``` + +### Deploying to Docker? + +- Deploying to Dagster+: When used with a Dagster+ Hybrid deployment, the Docker agent executes Dagster jobs on a Docker cluster.
+- Deploying to Open Source: Visit the [Deploying Dagster to Docker](https://docs.dagster.io/deployment/guides/docker) guide for more information.
+
+### About Docker
+
+**Docker** is a set of platform-as-a-service products that use OS-level virtualization to deliver software in packages called containers. The service has both free and premium tiers. The software that hosts the containers is called Docker Engine.
diff --git a/docs/docs-beta/docs/integrations/duckdb.md b/docs/docs-beta/docs/integrations/duckdb.md
new file mode 100644
index 0000000000000..f3a3469a0c9e0
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/duckdb.md
@@ -0,0 +1,50 @@
+---
+layout: Integration
+status: published
+name: DuckDB
+title: Dagster & DuckDB
+sidebar_label: DuckDB
+excerpt: Read and write natively to DuckDB from Software Defined Assets.
+date: 2022-11-07
+apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-duckdb
+docslink: https://dagster.io/blog/duckdb-data-lake
+partnerlink: https://duckdb.org/
+logo: /integrations/Duckdb.svg
+categories:
+  - Storage
+enabledBy:
+enables:
+---
+
+### About this integration
+
+This library provides an integration with the DuckDB database, and allows for an out-of-the-box [I/O Manager](https://docs.dagster.io/concepts/io-management/io-managers) so that you can make DuckDB your storage of choice.
+
+### Installation
+
+```bash
+pip install dagster-duckdb
+```
+
+### Example
+
+```python
+from dagster_duckdb_pandas import DuckDBPandasIOManager
+from dagster import Definitions, asset
+import pandas as pd
+
+@asset(
+    key_prefix=["my_schema"]  # will be used as the schema in duckdb
+)
+def my_table() -> pd.DataFrame:  # the name of the asset will be the table name
+    return pd.DataFrame()
+
+defs = Definitions(
+    assets=[my_table],
+    resources={"io_manager": DuckDBPandasIOManager(database="my_db.duckdb")}
+)
+```
+
+### About DuckDB
+
+**DuckDB** is a column-oriented in-process OLAP database. A typical OLTP relational database like SQLite is row-oriented. In a row-oriented database, data is organized physically as consecutive tuples.
diff --git a/docs/docs-beta/docs/integrations/fivetran.md b/docs/docs-beta/docs/integrations/fivetran.md
new file mode 100644
index 0000000000000..8751164e00067
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/fivetran.md
@@ -0,0 +1,45 @@
+---
+layout: Integration
+status: published
+name: Fivetran
+title: Dagster & Fivetran
+sidebar_label: Fivetran
+excerpt: Orchestrate Fivetran connectors and schedule syncs with upstream or downstream dependencies.
+date: 2022-11-07
+apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-fivetran
+docslink: https://docs.dagster.io/integrations/fivetran
+partnerlink: https://www.fivetran.com/
+logo: /integrations/Fivetran.svg
+categories:
+  - ETL
+enabledBy:
+enables:
+---
+
+### About this integration
+
+The Dagster-Fivetran integration enables you to orchestrate data ingestion as part of a larger pipeline. Programmatically interact with the Fivetran REST API to initiate syncs and monitor their progress.
+ +### Installation + +```bash +pip install dagster-fivetran +``` + +### Example + +```python +from dagster import EnvVar +from dagster_fivetran import FivetranResource, load_assets_from_fivetran_instance +import os + +fivetran_instance = FivetranResource( + api_key="some_key", + api_secret=EnvVar("FIVETRAN_SECRET"), +) +fivetran_assets = load_assets_from_fivetran_instance(fivetran_instance) +``` + +### About Fivetran + +**Fivetran** ingests data from SaaS applications, databases, and servers. The data is stored and typically used for analytics. diff --git a/docs/docs-beta/docs/integrations/gcp-bigquery.md b/docs/docs-beta/docs/integrations/gcp-bigquery.md new file mode 100644 index 0000000000000..9c083e4b58d03 --- /dev/null +++ b/docs/docs-beta/docs/integrations/gcp-bigquery.md @@ -0,0 +1,49 @@ +--- +layout: Integration +status: published +name: GCP BigQuery +title: Dagster & GCP BigQuery +sidebar_label: GCP BigQuery +excerpt: Integrate with GCP BigQuery. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-gcp +docslink: +partnerlink: +logo: /integrations/gcp-bigquery.svg +categories: + - Storage +enabledBy: +enables: +--- + +### About this integration + +The Google Cloud Platform BigQuery integration allows data engineers to easily query and store data in the BigQuery data warehouse through the use of the `BigQueryResource`. + +### Installation + +```bash +pip install dagster-gcp +``` + +### Examples + +```python +from dagster import Definitions, asset +from dagster_gcp import BigQueryResource + + +@asset +def my_table(bigquery: BigQueryResource): + with bigquery.get_client() as client: + client.query("SELECT * FROM my_dataset.my_table") + + +defs = Definitions( + assets=[my_table], resources={"bigquery": BigQueryResource(project="my-project")} +) +``` + +### About Google Cloud Platform BigQuery + +The Google Cloud Platform BigQuery service, offers a fully managed enterprise data warehouse that enables fast SQL queries using the processing power of Google's infrastructure. diff --git a/docs/docs-beta/docs/integrations/gcp-dataproc.md b/docs/docs-beta/docs/integrations/gcp-dataproc.md new file mode 100644 index 0000000000000..b8162c2d933cd --- /dev/null +++ b/docs/docs-beta/docs/integrations/gcp-dataproc.md @@ -0,0 +1,62 @@ +--- +layout: Integration +status: published +name: GCP Dataproc +title: Dagster & GCP Dataproc +sidebar_label: GCP Dataproc +excerpt: Integrate with GCP Dataproc. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-gcp +docslink: +partnerlink: +logo: /integrations/gcp-dataproc.svg +categories: + - Compute +enabledBy: +enables: +--- + +### About this integration + +Using this integration, you can manage and interact with Google Cloud Platform's Dataproc service directly from Dagster. This integration allows you to create, manage, and delete Dataproc clusters, and submit and monitor jobs on these clusters. 
+ +### Installation + +```bash +pip install dagster-gcp +``` + +### Examples + +```python +from dagster import asset, Definitions +from dagster_gcp.dataproc import DataprocResource + + +dataproc_resource = DataprocResource( + project_id="your-gcp-project-id", + region="your-gcp-region", + cluster_name="your-cluster-name", + cluster_config_yaml_path="path/to/your/cluster/config.yaml", +) + + +@asset +def my_dataproc_asset(dataproc: DataprocResource): + with dataproc.get_client() as client: + job_details = { + "job": { + "placement": {"clusterName": dataproc.cluster_name}, + } + } + client.submit_job(job_details) + + +defs = Definitions( + assets=[my_dataproc_asset], resources={"dataproc": dataproc_resource} +) +``` + +### About Google Cloud Platform Dataproc + +Google Cloud Platform's **Dataproc** is a fully managed and highly scalable service for running Apache Spark, Apache Hadoop, and other open source data processing frameworks. Dataproc simplifies the process of setting up and managing clusters, allowing you to focus on your data processing tasks without worrying about the underlying infrastructure. With Dataproc, you can quickly create clusters, submit jobs, and monitor their progress, all while benefiting from the scalability and reliability of Google Cloud Platform. diff --git a/docs/docs-beta/docs/integrations/gcp-gcs.md b/docs/docs-beta/docs/integrations/gcp-gcs.md new file mode 100644 index 0000000000000..999edb77ed175 --- /dev/null +++ b/docs/docs-beta/docs/integrations/gcp-gcs.md @@ -0,0 +1,59 @@ +--- +layout: Integration +status: published +name: GCP GCS +title: Dagster & GCP GCS +sidebar_label: GCP GCS +excerpt: Integrate with GCP GCS. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-gcp +docslink: +partnerlink: +logo: /integrations/gcp-gcs.svg +categories: + - Storage +enabledBy: +enables: +--- + +### About this integration + +This integration allows you to interact with Google Cloud Storage (GCS) using Dagster. It provides resources, I/O Managers, and utilities to manage and store data in GCS, making it easier to integrate GCS into your data pipelines. + +### Installation + +```bash +pip install dagster-gcp +``` + +### Examples + +```python + +import pandas as pd +from dagster import Definitions, asset +from dagster_gcp.gcs import GCSResource + + +@asset +def my_gcs_asset(gcs: GCSResource): + df = pd.DataFrame({"column1": [1, 2, 3], "column2": ["A", "B", "C"]}) + + csv_data = df.to_csv(index=False) + + gcs_client = gcs.get_client() + + bucket = gcs_client.bucket("my-cool-bucket") + blob = bucket.blob("path/to/my_dataframe.csv") + blob.upload_from_string(csv_data) + + +defs = Definitions( + assets=[my_gcs_asset], + resources={"gcs": GCSResource(project="my-gcp-project")}, +) +``` + +### About Google Cloud Platform GCS + +**Google Cloud Storage (GCS)**, is a scalable and secure object storage service. GCS is designed for storing and accessing any amount of data at any time, making it ideal for data science, AI infrastructure, and frameworks for ML like AutoML. With this integration, you can leverage GCS for efficient data storage and retrieval within your Dagster pipelines. 
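+
+In addition to the resource shown above, `dagster-gcp` also ships a GCS-backed I/O manager for persisting asset outputs. The snippet below is a minimal, illustrative sketch: it assumes the `GCSPickleIOManager` from `dagster_gcp.gcs` and a hypothetical bucket named `my-cool-bucket`; adjust the project, bucket, and prefix for your environment.
+
+```python
+from dagster import Definitions, asset
+from dagster_gcp.gcs import GCSPickleIOManager, GCSResource
+
+
+@asset
+def numbers() -> list:
+    # The I/O manager pickles this return value and writes it to GCS.
+    return [1, 2, 3]
+
+
+@asset
+def doubled(numbers: list) -> list:
+    # Downstream assets are loaded back from GCS automatically.
+    return [n * 2 for n in numbers]
+
+
+defs = Definitions(
+    assets=[numbers, doubled],
+    resources={
+        "io_manager": GCSPickleIOManager(
+            gcs=GCSResource(project="my-gcp-project"),
+            gcs_bucket="my-cool-bucket",
+            gcs_prefix="my-dagster-data",
+        ),
+    },
+)
+```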
diff --git a/docs/docs-beta/docs/integrations/github.md b/docs/docs-beta/docs/integrations/github.md new file mode 100644 index 0000000000000..7827cdb61c39e --- /dev/null +++ b/docs/docs-beta/docs/integrations/github.md @@ -0,0 +1,60 @@ +--- +layout: Integration +status: published +name: GitHub +title: Dagster & GitHub +sidebar_label: GitHub +excerpt: Integrate with GitHub Apps and automate operations within your github repositories. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-github +docslink: +partnerlink: https://github.com/ +logo: /integrations/Github.svg +categories: + - Other +enabledBy: +enables: +--- + +### About this integration + +This library provides an integration with _[GitHub Apps](https://docs.github.com/en/developers/apps/getting-started-with-apps/about-apps)_ by providing a thin wrapper on the GitHub v4 GraphQL API. This allows for automating operations within your GitHub repositories and with the tighter permissions scopes that GitHub Apps allow for vs using a personal token. + +### Installation + +```bash +pip install dagster-github +``` + +### Example + +```python +import dagster as dg +from dagster_github import GithubResource + + +@dg.asset +def github_asset(github: GithubResource): + github.get_client().create_issue( + repo_name="dagster", + repo_owner="dagster-io", + title="Dagster's first github issue", + body="this open source thing seems like a pretty good idea", + ) + + +defs = dg.Definitions( + assets=[github_asset], + resources={ + "github": GithubResource( + github_app_id=dg.EnvVar("GITHUB_APP_ID"), + github_app_private_rsa_key=dg.EnvVar("GITHUB_PRIVATE_KEY"), + github_installation_id=dg.EnvVar("GITHUB_INSTALLATION_ID"), + ) + }, +) +``` + +### About GitHub + +**GitHub** provides a highly available git repo, access control, bug tracking, software feature requests, task management, continuous integration, and wikis for open source and commercial projects. diff --git a/docs/docs-beta/docs/integrations/hashicorp.md b/docs/docs-beta/docs/integrations/hashicorp.md new file mode 100644 index 0000000000000..04c73312d704a --- /dev/null +++ b/docs/docs-beta/docs/integrations/hashicorp.md @@ -0,0 +1,58 @@ +--- +layout: Integration +status: published +name: HashiCorp Vault +title: Dagster & HashiCorp Vault +sidebar_label: HashiCorp Vault +excerpt: Centrally manage credentials and certificates, then use them in your pipelines. +date: 2022-11-07 +apireflink: +docslink: https://github.com/silentsokolov/dagster-hashicorp +partnerlink: https://www.vaultproject.io/ +communityIntegration: true +logo: /integrations/Hashicorp.svg +categories: + - Other +enabledBy: +enables: +--- + +### About this integration + +Package for integrating HashiCorp Vault into Dagster so that you can securely manage tokens and passwords. 
+ +### Installation + +```bash +pip install dagster-hashicorp +``` + +### Example + +```python +# See the Resources docs to learn more: https://docs.dagster.io/concepts/resources + +from dagster import asset, repository, with_resources +from dagster_hashicorp.vault import vault_resource +import os + +@asset(required_resource_keys={"vault"}) +def example_asset(context): + secret_data = context.resources.vault.read_secret( + secret_path="secret/data/foo/bar" + ) + context.log.debug(f"Secret: {secret_data}") + + +assets = with_resources( + [example_asset], + {"vault": vault_resource.configured({ + "url": "vault-host:8200", + "auth_type": {"token": {"token": os.environ['VAULT_AUTH_TOKEN']}}, + })} +) +``` + +### About HashiCorp Vault + +**HashiCorp** provides open source tools and commercial products that enable developers, operators and security professionals to provision, secure, run and connect cloud-computing infrastructure. **HashiCorp Vault** secures, stores, and tightly controls access to tokens, passwords, certificates, API keys, and other secrets in modern computing. diff --git a/docs/docs-beta/docs/integrations/hightouch.md b/docs/docs-beta/docs/integrations/hightouch.md new file mode 100644 index 0000000000000..26cb9008fc288 --- /dev/null +++ b/docs/docs-beta/docs/integrations/hightouch.md @@ -0,0 +1,58 @@ +--- +layout: Integration +status: published +name: Hightouch +title: Dagster & Hightouch +sidebar_label: Hightouch +excerpt: Trigger syncs and monitor them until they complete. +date: 2022-11-07 +docslink: https://github.com/hightouchio/dagster-hightouch +partnerlink: https://hightouch.com/ +communityIntegration: true +logo: /integrations/Hightouch.svg +categories: + - ETL +enabledBy: +enables: +--- + +### About this integration + +With this integration you can trigger Hightouch syncs and monitor them from within Dagster. Fine-tune when Hightouch syncs kick-off, visualize their dependencies, and monitor the steps in your data activation workflow. + +This native integration helps your team more effectively orchestrate the last mile of data analytics—bringing that data from the warehouse back into the SaaS tools your business teams live in. With the `dagster-hightouch` integration, Hightouch users have more granular and sophisticated control over when data gets activated. + +### Installation + +```bash +pip install dagster-hightouch +``` + +### Example + +```python +from dagster import job +from dagster_hightouch.ops import hightouch_sync_op +from dagster_hightouch.resources import ht_resource +import os + +HT_ORG = "39619" + +run_ht_sync_orgs = hightouch_sync_op.configured( + {"sync_id": HT_ORG}, name="hightouch_sfdc_organizations" +) + +@job( + resource_defs={ + "hightouch": ht_resource.configured( + {"api_key": os.environ['HIGHTOUCH_API_KEY']}, + ), + } +) +def ht_sfdc_job(): + ht_orgs = run_ht_sync_orgs() +``` + +### About Hightouch + +**Hightouch** syncs data from any data warehouse into popular SaaS tools that businesses run on. Hightouch uses the power of Reverse ETL to transform core business applications from isolated data islands into powerful integrated solutions. 
diff --git a/docs/docs-beta/docs/integrations/jupyter.md b/docs/docs-beta/docs/integrations/jupyter.md
new file mode 100644
index 0000000000000..92d45b82cd16f
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/jupyter.md
@@ -0,0 +1,22 @@
+---
+layout: Integration
+status: published
+name: Jupyter Notebooks
+title: Dagster & Jupyter Notebooks
+sidebar_label: Jupyter Notebooks
+excerpt: Dagstermill eliminates the tedious "productionization" of Jupyter notebooks.
+date: 2022-11-07
+apireflink:
+docslink: https://docs.dagster.io/integrations/dagstermill
+partnerlink:
+logo: /integrations/Jupyter.svg
+enabledBy:
+  - dagster-dagstermill
+categories:
+  - Compute
+enables:
+---
+
+### About Jupyter
+
+Fast iteration, the literate combination of arbitrary code with markdown blocks, and inline plotting make notebooks an indispensable tool for data science. The **Dagstermill** package makes it easy to run notebooks using the Dagster tools and to integrate them into data jobs with heterogeneous ops: for instance, Spark jobs, SQL statements run against a data warehouse, or arbitrary Python code.
diff --git a/docs/docs-beta/docs/integrations/kubernetes.md b/docs/docs-beta/docs/integrations/kubernetes.md
new file mode 100644
index 0000000000000..4c256178f1a80
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/kubernetes.md
@@ -0,0 +1,61 @@
+---
+layout: Integration
+status: published
+name: Kubernetes
+title: Dagster & Kubernetes
+sidebar_label: Kubernetes
+excerpt: Launch Kubernetes pods and execute external code directly from Dagster.
+date: 2024-08-30
+apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-k8s
+docslink: https://docs.dagster.io/concepts/dagster-pipes/kubernetes
+partnerlink: https://kubernetes.io/
+logo: /integrations/Kubernetes.svg
+categories:
+  - Compute
+enabledBy:
+enables:
+---
+
+### About this integration
+
+The `dagster-k8s` integration library provides the `PipesK8sClient` resource, enabling you to launch Kubernetes pods and execute external code directly from Dagster assets and ops. This integration allows you to pass parameters to Kubernetes pods while Dagster receives real-time events, such as logs, asset checks, and asset materializations, from the initiated jobs. With minimal code changes required on the job side, this integration is both efficient and easy to implement.
+
+### Installation
+
+```bash
+pip install dagster-k8s
+```
+
+### Example
+
+```python
+import dagster as dg
+from dagster_k8s import PipesK8sClient
+
+
+@dg.asset
+def k8s_pipes_asset(
+    context: dg.AssetExecutionContext, k8s_pipes_client: PipesK8sClient
+):
+    return k8s_pipes_client.run(
+        context=context,
+        image="pipes-example:v1",
+    ).get_materialize_result()
+
+
+defs = dg.Definitions(
+    assets=[k8s_pipes_asset],
+    resources={
+        "k8s_pipes_client": PipesK8sClient(),
+    },
+)
+```
+
+### Deploying to Kubernetes?
+
+- Deploying to Dagster+: With a Dagster+ Hybrid deployment, the Kubernetes agent executes Dagster jobs on a Kubernetes cluster. Check out the [Dagster+ Kubernetes Agent](https://docs.dagster.io/dagster-plus/deployment/agents/kubernetes) guide for more information.
+- Deploying to Open Source: Visit the [Deploying Dagster to Kubernetes](https://docs.dagster.io/deployment/guides/kubernetes) guide for more information.
+
+### About Kubernetes
+
+**Kubernetes** is an open source container orchestration system for automating software deployment, scaling, and management. Google originally designed Kubernetes, but the Cloud Native Computing Foundation now maintains the project.
diff --git a/docs/docs-beta/docs/integrations/lakefs.md b/docs/docs-beta/docs/integrations/lakefs.md new file mode 100644 index 0000000000000..709ae2cb77eaf --- /dev/null +++ b/docs/docs-beta/docs/integrations/lakefs.md @@ -0,0 +1,77 @@ +--- +layout: Integration +status: published +name: LakeFS +title: Dagster & LakeFS +sidebar_label: LakeFS +excerpt: lakeFS provides version control and complete lineage over the data lake. +date: 2023-06-27 +communityIntegration: true +apireflink: https://pydocs.lakefs.io/ +docslink: +partnerlink: https://lakefs.io/ +logo: /integrations/lakefs.svg +categories: + - Storage +enabledBy: +enables: +--- + +### About this integration + +By integrating with lakeFS, a big data scale version control system, you can leverage the versioning capabilities of lakeFS to track changes to your data. This integration allows you to have a complete lineage of your data, from the initial raw data to the transformed and processed data, making it easier to understand and reproduce data transformations. + +With lakeFS and Dagster integration, you can ensure that data flowing through your Dagster jobs is easily reproducible. lakeFS provides a consistent view of your data across different versions, allowing you to troubleshoot pipeline runs and ensure consistent results. + +Furthermore, with lakeFS branching capabilities, Dagster jobs can run on separate branches without additional storage costs, creating isolation and allowing promotion of only high-quality data to production leveraging a CI/CD pipeline for your data. + + +### Installation + +```bash +pip install lakefs-client +``` + +### Example + +```python +from dagster import job, op, get_dagster_logger, Definitions, ResourceParam +import lakefs_client +from lakefs_client import models +from lakefs_client.client import LakeFSClient + +logger = get_dagster_logger() + +configuration = lakefs_client.Configuration() +configuration.username = 'AAAA' +configuration.password = 'BBBBB' +configuration.host = 'https://my-org.us-east-1.lakefscloud.io' + +@op +def create_branch(client: ResourceParam[LakeFSClient]): + branch_id = client.branches.create_branch( + repository='test-repo', + branch_creation=models.BranchCreation( + name='experiment', + source='main')) + logger.info(branch_id) + +@op +def list_branches(client: ResourceParam[LakeFSClient]): + list_branches = client.branches.list_branches(repository='test-repo') + logger.info(list_branches) + +@job +def lakefs_integration_job(): + create_branch() + list_branches() + +defs = Definitions( + jobs=[lakefs_integration_job], + resources={"client": LakeFSClient(configuration)}, +) +``` + +### About lakeFS + +**lakeFS** is on a mission to simplify the lives of data engineers, data scientists and analysts providing a data version control platform at scale. diff --git a/docs/docs-beta/docs/integrations/looker.md b/docs/docs-beta/docs/integrations/looker.md new file mode 100644 index 0000000000000..40f0917ff9e1f --- /dev/null +++ b/docs/docs-beta/docs/integrations/looker.md @@ -0,0 +1,46 @@ +--- +layout: Integration +status: published +name: Looker +title: Dagster & Looker +sidebar_label: Looker +excerpt: The Looker integration allows you to monitor your Looker project as assets in Dagster, along with other data assets. 
+date: 2024-08-30
+apireflink:
+docslink: https://docs.dagster.io/_apidocs/libraries/dagster-looker
+partnerlink: https://www.looker.com/
+communityIntegration: true
+logo: /integrations/looker.svg
+categories:
+  - BI
+enabledBy:
+enables:
+---
+
+### About this integration
+
+Dagster allows you to represent your Looker project as assets, alongside your other technologies like dbt and Sling. This allows you to see how your Looker assets are connected to your other data assets, and how changes to other data assets might impact your Looker project.
+
+### Installation
+
+```bash
+pip install dagster-looker
+```
+
+### Example
+
+```python
+from pathlib import Path
+
+import dagster as dg
+from dagster_looker import build_looker_asset_specs
+
+looker_specs = build_looker_asset_specs(project_dir=Path("my_looker_project"))
+looker_assets = dg.external_assets_from_specs(looker_specs)
+
+defs = dg.Definitions(assets=looker_assets)
+```
+
+### About Looker
+
+**Looker** is a modern platform for data analytics and visualization. It provides a unified interface for data exploration, modeling, and visualization, making it easier to understand and analyze data. Looker integrates with various data sources and can be used to create interactive reports, dashboards, and visualizations.
diff --git a/docs/docs-beta/docs/integrations/meltano.md b/docs/docs-beta/docs/integrations/meltano.md
new file mode 100644
index 0000000000000..8a7447138d68e
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/meltano.md
@@ -0,0 +1,50 @@
+---
+layout: Integration
+status: published
+name: Meltano
+title: Dagster & Meltano
+sidebar_label: Meltano
+excerpt: Tap into open source configurable ETL+ and the Singer integration library.
+date: 2023-03-25
+apireflink:
+docslink: https://github.com/quantile-development/dagster-meltano#readme
+partnerlink: https://meltano.com/
+logo: /integrations/Meltano.svg
+categories:
+  - ETL
+communityIntegration: true
+enabledBy:
+enables:
+---
+
+### About this integration
+
+The `dagster-meltano` library allows you to run Meltano using Dagster. Design and configure ingestion jobs using the popular [Singer.io](https://singer.io) specification.
+
+**Note** that this integration can also be [managed from the Meltano platform](https://hub.meltano.com/utilities/dagster/) using `meltano add utility dagster` and configured using `meltano config dagster set --interactive`.
+
+### Installation
+
+```bash
+pip install dagster-meltano
+```
+
+### Example
+
+```python
+from dagster import repository, job
+from dagster_meltano import meltano_resource, meltano_run_op
+
+@job(resource_defs={"meltano": meltano_resource})
+def meltano_run_job():
+    tap_done = meltano_run_op("tap-1 target-1")()
+    meltano_run_op("tap-2 target-2")(tap_done)
+
+@repository()
+def meltano_repository():
+    return [meltano_run_job]
+```
+
+### About Meltano
+
+[Meltano](https://meltano.com/) provides data engineers with a set of tools for easily creating and managing pipelines as code by providing a wide array of composable connectors. Meltano's 'CLI for ELT+' lets you test your changes before they go live.
diff --git a/docs/docs-beta/docs/integrations/microsoft-teams.md b/docs/docs-beta/docs/integrations/microsoft-teams.md new file mode 100644 index 0000000000000..5e4b78551afdd --- /dev/null +++ b/docs/docs-beta/docs/integrations/microsoft-teams.md @@ -0,0 +1,52 @@ +--- +layout: Integration +status: published +name: Microsoft Teams +title: Dagster & Microsoft Teams +sidebar_label: Microsoft Teams +excerpt: Keep your team up to speed with Teams messages. +date: 2024-08-30 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-msteams +docslink: +partnerlink: https://www.microsoft.com/en-us/microsoft-teams/group-chat-software +logo: /integrations/Microsoft Teams.svg +categories: + - Alerting +enabledBy: +enables: +--- + +### About this integration + +By configuring this resource, you can post messages to MS Teams from any Dagster op or asset. + +### Installation + +```bash +pip install dagster-msteams +``` + +### Example + +```python +# Read the docs on Resources to learn more: https://docs.dagster.io/deployment/resources +import dagster as dg +from dagster_msteams import Card, MSTeamsResource + + +@dg.asset +def microsoft_teams_message(msteams: MSTeamsResource): + card = Card() + card.add_attachment(text_message="Hello there!") + msteams.get_client().post_message(payload=card.payload) + + +defs = dg.Definitions( + assets=[microsoft_teams_message], + resources={"msteams": MSTeamsResource(hook_url=dg.EnvVar("TEAMS_WEBHOOK_URL"))}, +) +``` + +### About Microsoft Teams + +**Microsoft Teams** is a business communication platform. Teams offers workspace chat and videoconferencing, file storage, and application integration. diff --git a/docs/docs-beta/docs/integrations/open-metadata.md b/docs/docs-beta/docs/integrations/open-metadata.md new file mode 100644 index 0000000000000..3e7aea771f5c1 --- /dev/null +++ b/docs/docs-beta/docs/integrations/open-metadata.md @@ -0,0 +1,28 @@ +--- +layout: Integration +status: published +name: Open Metadata +title: Dagster & Open Metadata +sidebar_label: Open Metadata +excerpt: Configure and schedule Dagster metadata and profiler workflows from the OpenMetadata UI. +date: 2022-11-07 +apireflink: +docslink: https://docs.open-metadata.org/connectors/pipeline/dagster +partnerlink: https://open-metadata.org/ +communityIntegration: true +logo: /integrations/OpenMetadata.svg +categories: + - Metadata +enabledBy: +enables: +--- + +### About this integration + +With this integration you can create a Open Metadata service to ingest metadata produced by the Dagster application. View the Ingestion Pipeline running from the Open Metadata Service Page. + +### About Open Metadata + +Poorly organized metadata is preventing organizations from realizing the full potential of data. Most metadata is incorrect, inconsistent, stale, missing, and fragmented in silos across various disconnected tools obscuring a holistic picture of data. + +**Open Metadata** is an all-in-one platform for data discovery, data lineage, data quality, observability, governance, and team collaboration. It's one of the fastest growing open source projects with a vibrant community and adoption by a diverse set of companies in a variety of industry verticals. Powered by a centralized metadata store based on Open Metadata Standards/APIs, supporting connectors to a wide range of data services, OpenMetadata enables end-to-end metadata management, giving you the freedom to unlock the value of your data assets. 
diff --git a/docs/docs-beta/docs/integrations/openai.md b/docs/docs-beta/docs/integrations/openai.md new file mode 100644 index 0000000000000..61564bf032138 --- /dev/null +++ b/docs/docs-beta/docs/integrations/openai.md @@ -0,0 +1,67 @@ +--- +layout: Integration +status: published +name: OpenAI +title: Dagster & OpenAI +sidebar_label: OpenAI +excerpt: Integrate OpenAI calls into your Dagster pipelines, without breaking the bank. +date: 2024-03-12 +apireflink: https://platform.openai.com/docs/introduction +docslink: https://docs.dagster.io/integrations/openai +partnerlink: +logo: /integrations/openai.svg +categories: + - Other +enabledBy: +enables: +--- + +### About this integration + +The `dagster-openai` library allows you to easily interact with the OpenAI REST API using the OpenAI Python API to build AI steps into your Dagster pipelines. You can also log OpenAI API usage metadata in Dagster Insights, giving you detailed observability on API call credit consumption. + +When paired with Dagster assets, the resource automatically logs OpenAI usage metadata in asset metadata. + +### Installation + +```bash +pip install dagster dagster-openai +``` + +### Example + +```python +from dagster_openai import OpenAIResource + +from dagster import ( + AssetExecutionContext, + Definitions, + EnvVar, + asset, + define_asset_job, +) + + +@asset(compute_kind="OpenAI") +def openai_asset(context: AssetExecutionContext, openai: OpenAIResource): + with openai.get_client(context) as client: + client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Say this is a test."}], + ) + + +openai_asset_job = define_asset_job(name="openai_asset_job", selection="openai_asset") + +defs = Definitions( + assets=[openai_asset], + jobs=[openai_asset_job], + resources={ + "openai": OpenAIResource(api_key=EnvVar("OPENAI_API_KEY")), + }, +) +``` + +### About OpenAI + +OpenAI is a U.S. based artificial intelligence (AI) research organization with the goal of developing "safe and beneficial" artificial general intelligence, which it defines as "highly autonomous systems that outperform humans at most economically valuable work". diff --git a/docs/docs-beta/docs/integrations/pagerduty.md b/docs/docs-beta/docs/integrations/pagerduty.md new file mode 100644 index 0000000000000..c5272dd4ace05 --- /dev/null +++ b/docs/docs-beta/docs/integrations/pagerduty.md @@ -0,0 +1,57 @@ +--- +layout: Integration +status: published +name: PagerDuty +title: Dagster & PagerDuty +sidebar_label: PagerDuty +excerpt: Centralize your monitoring with the dagster-pagerduty integration. +date: 2024-08-30 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-pagerduty +docslink: +partnerlink: https://www.pagerduty.com/ +logo: /integrations/PagerDuty.svg +categories: + - Alerting +enabledBy: +enables: +--- + +### About this integration + +This library provides an integration between Dagster and PagerDuty to support creating alerts from your Dagster code. 
+ +### Installation + +```bash +pip install dagster_pagerduty +``` + +### Example + +```python +import dagster as dg +from dagster_pagerduty import PagerDutyService + + +@dg.asset +def pagerduty_alert(pagerduty: PagerDutyService): + pagerduty.EventV2_create( + summary="alert from dagster", + source="localhost", + severity="error", + event_action="trigger", + ) + + +defs = dg.Definitions( + assets=[pagerduty_alert], + resources={ + "pagerduty": PagerDutyService(routing_key="0123456789abcdef0123456789abcdef") + }, +) + +``` + +### About PagerDuty + +**PagerDuty** is a popular SaaS incident response platform. It integrates machine data & human intelligence to improve visibility & agility for Real-Time Operations. diff --git a/docs/docs-beta/docs/integrations/pandas.md b/docs/docs-beta/docs/integrations/pandas.md new file mode 100644 index 0000000000000..b051eb7a98023 --- /dev/null +++ b/docs/docs-beta/docs/integrations/pandas.md @@ -0,0 +1,31 @@ +--- +layout: Integration +status: published +name: Pandas +title: Dagster & Pandas +sidebar_label: Pandas +excerpt: Implement validation on pandas DataFrames. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-pandas +docslink: https://docs.dagster.io/integrations/pandas +partnerlink: https://pandas.pydata.org/ +logo: /integrations/Pandas.svg +categories: + - Metadata +enabledBy: +enables: +--- + +### About this integration + +Perform data validation, emit summary statistics, and enable reliable DataFrame serialization/deserialization. The dagster_pandas library provides you with the utilities for implementing validation on Pandas DataFrames. The Dagster type system generates documentation of your DataFrame constraints and makes it accessible in the Dagster UI. + +### Installation + +```bash +pip install dagster-pandas +``` + +### About Pandas + +**Pandas** is a popular Python package that provides data structures designed to make working with "relational" or "labeled" data both easy and intuitive. Pandas aims to be the fundamental high-level building block for doing practical, real-world data analysis in Python. diff --git a/docs/docs-beta/docs/integrations/pandera.md b/docs/docs-beta/docs/integrations/pandera.md new file mode 100644 index 0000000000000..8374eaa255b56 --- /dev/null +++ b/docs/docs-beta/docs/integrations/pandera.md @@ -0,0 +1,69 @@ +--- +layout: Integration +status: published +name: Pandera +title: Dagster & Pandera +sidebar_label: Pandera +excerpt: Generate Dagster Types from Pandera dataframe schemas. +date: 2022-11-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-pandera +docslink: https://docs.dagster.io/integrations/pandera +partnerlink: https://pandera.readthedocs.io/en/stable/ +logo: /integrations/Pandera.svg +categories: + - Metadata +enabledBy: +enables: +--- + +### About this integration + +The `dagster-pandera` integration library provides an API for generating Dagster Types from [Pandera DataFrame schemas](https://pandera.readthedocs.io/en/stable/dataframe_schemas.html). + +Like all Dagster types, Dagster-Pandera-generated types can be used to annotate op inputs and outputs. This provides runtime type-checking with rich error reporting and allows Dagster UI to display information about a DataFrame's structure. 
+
+### Installation
+
+```bash
+pip install dagster-pandera
+```
+
+### Example
+
+```python
+import random
+import pandas as pd
+import pandera as pa
+from dagster_pandera import pandera_schema_to_dagster_type
+from pandera.typing import Series
+from dagster import asset
+
+APPLE_STOCK_PRICES = {
+    "name": ["AAPL", "AAPL", "AAPL", "AAPL", "AAPL"],
+    "date": ["2018-01-22", "2018-01-23", "2018-01-24", "2018-01-25", "2018-01-26"],
+    "open": [177.3, 177.3, 177.25, 174.50, 172.0],
+    "close": [177.0, 177.04, 174.22, 171.11, 171.51],
+}
+
+
+class StockPrices(pa.SchemaModel):
+    """Open/close prices for one or more stocks by day."""
+
+    name: Series[str] = pa.Field(description="Ticker symbol of stock")
+    date: Series[str] = pa.Field(description="Date of prices")
+    open: Series[float] = pa.Field(ge=0, description="Price at market open")
+    close: Series[float] = pa.Field(ge=0, description="Price at market close")
+
+
+@asset(dagster_type=pandera_schema_to_dagster_type(StockPrices))
+def apple_stock_prices_dirty():
+    prices = pd.DataFrame(APPLE_STOCK_PRICES)
+    i = random.choice(prices.index)
+    prices.loc[i, "open"] = pd.NA
+    prices.loc[i, "close"] = pd.NA
+    return prices
+```
+
+### About Pandera
+
+**Pandera** is a statistical data testing toolkit, and a data validation library for scientists, engineers, and analysts seeking correctness.
diff --git a/docs/docs-beta/docs/integrations/prometheus.md b/docs/docs-beta/docs/integrations/prometheus.md
new file mode 100644
index 0000000000000..392f32a24bc75
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/prometheus.md
@@ -0,0 +1,54 @@
+---
+layout: Integration
+status: published
+name: Prometheus
+title: Dagster & Prometheus
+sidebar_label: Prometheus
+excerpt: Integrate with Prometheus via the prometheus_client library.
+date: 2024-08-30
+apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-prometheus
+docslink:
+partnerlink: https://prometheus.io/
+logo: /integrations/Prometheus.svg
+categories:
+  - Monitoring
+enabledBy:
+enables:
+---
+
+### About this integration
+
+This integration allows you to push metrics to the Prometheus gateway from within a Dagster pipeline.
+
+### Installation
+
+```bash
+pip install dagster-prometheus
+```
+
+### Example
+
+```python
+import dagster as dg
+from dagster_prometheus import PrometheusResource
+
+
+@dg.asset
+def prometheus_metric(prometheus: PrometheusResource):
+    prometheus.push_to_gateway(job="my_job_label")
+
+
+defs = dg.Definitions(
+    assets=[prometheus_metric],
+    resources={
+        "prometheus": PrometheusResource(gateway="http://pushgateway.example.org:9091")
+    },
+)
+```
+
+### About Prometheus
+
+**Prometheus** is an open source systems monitoring and alerting toolkit. Originally built at SoundCloud, Prometheus joined the Cloud Native Computing Foundation in 2016 as the second hosted project, after Kubernetes.
+
+Prometheus collects and stores metrics as time series data along with the timestamp at which it was recorded, alongside optional key-value pairs called labels.
diff --git a/docs/docs-beta/docs/integrations/sdf.md b/docs/docs-beta/docs/integrations/sdf.md
new file mode 100644
index 0000000000000..b1cc2065127e4
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/sdf.md
@@ -0,0 +1,77 @@
+---
+layout: Integration
+status: published
+name: SDF
+title: Dagster & SDF
+sidebar_label: SDF
+excerpt: Put your SDF transformations to work, directly from within Dagster.
+date: 2024-08-30 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-sdf +docslink: https://docs.sdf.com/integrations/dagster/getting-started +partnerlink: https://www.sdf.com/ +communityIntegration: true +logo: /integrations/sdf.jpeg +categories: + - ETL +enabledBy: +enables: +--- + +### About this integration + +SDF can integrate seamlessly with your existing Dagster projects, providing the best-in-class transformation layer while enabling you to schedule, orchestrate, and monitor your dags in Dagster. + +When it comes time to materialize your Dagster assets, you can be confident that SDF has successfully compiled your workspace, making it safe to execute locally or against your cloud data warehouse. + +### Installation + +```bash +pip install dagster-sdf +``` + +### Example + +```python +from pathlib import Path + +import dagster as dg +from dagster_sdf import ( + SdfCliResource, + SdfWorkspace, + sdf_assets, +) + +workspace_dir = Path(__file__).joinpath("./my_sdf_workspace").resolve() +target_dir = workspace_dir.joinpath( + "sdf_dagster_out" +) # The destination for outputs generated by SDF during execution +environment = "dbg" # Replace with your environment, e.g. "prod" + +workspace = SdfWorkspace( + workspace_dir=workspace_dir, + target_dir=target_dir, + environment=environment, +) + + +@sdf_assets(workspace=workspace) +def my_sdf_assets(context: dg.AssetExecutionContext, sdf: SdfCliResource): + yield from sdf.cli( + ["run", "--save", "info-schema"], + target_dir=target_dir, + environment=environment, + context=context, + ).stream() + + +defs = dg.Definitions( + assets=[my_sdf_assets], + resources={ + "sdf": SdfCliResource(workspace_dir=workspace_dir), + }, +) +``` + +### About SDF + +[SDF](https://www.sdf.com/) is a multi-dialect SQL compiler, transformation framework, and analytical database engine. It natively compiles SQL dialects, like Snowflake, and connects to their corresponding data warehouses to materialize models. diff --git a/docs/docs-beta/docs/integrations/secoda.md b/docs/docs-beta/docs/integrations/secoda.md new file mode 100644 index 0000000000000..0ad5625034aa1 --- /dev/null +++ b/docs/docs-beta/docs/integrations/secoda.md @@ -0,0 +1,28 @@ +--- +layout: Integration +status: published +name: Secoda +title: Dagster & Secoda +sidebar_label: Secoda +excerpt: Help your team understand metadata from Dagster by adding context in Secoda. +date: 2024-02-24 +apireflink: +docslink: https://www.secoda.co/automations/automated-documentation-for-new-integrations-in-dagster +partnerlink: https://www.secoda.co/integrations/dagster +communityIntegration: true +logo: /integrations/Secoda.svg +categories: + - Metadata +enabledBy: +enables: +--- + +### About this integration + +Connect Dagster to Secoda and see metadata related to your Dagster assets, asset groups and jobs right in Secoda. Simplify your team's access, and remove the need to switch between tools. + +When you connect Dagster to Secoda, you can use Secoda's tools to add further context to your Dagster assets and jobs. Help your team understand metadata from Dagster by adding context in Secoda, like creating Documents, defining Metrics, and adding Tags. + +### About Secoda + +Secoda is a AI-powered data search, cataloging, lineage, and documentation platform that empowers data teams to manage data sprawl, scale infrastructure, and overcome common issues such as lack of observability, governance, and lengthy setup and integration periods. 
diff --git a/docs/docs-beta/docs/integrations/shell.md b/docs/docs-beta/docs/integrations/shell.md
new file mode 100644
index 0000000000000..03c0996294a61
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/shell.md
@@ -0,0 +1,56 @@
+---
+layout: Integration
+status: published
+name: Bash / Shell
+title: Dagster & Bash / Shell
+sidebar_label: Bash / Shell
+excerpt: Execute a Bash/shell command, directly or read from a script file.
+date: 2024-08-20
+apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-shell
+docslink:
+partnerlink:
+logo: /integrations/Shell.svg
+categories:
+  - Compute
+enabledBy:
+enables:
+---
+
+### About this integration
+
+Dagster comes with a native `PipesSubprocessClient` resource that enables you to launch shell commands directly from Dagster assets and ops. This integration allows you to pass parameters to external shell scripts while Dagster receives real-time events, such as logs, asset checks, and asset materializations, from the initiated external execution. With minimal code changes required on the job side, this integration is both efficient and easy to implement.
+
+### Installation
+
+```bash
+pip install dagster
+```
+
+### Example
+
+```python
+import dagster as dg
+
+
+@dg.asset
+def shell_asset(
+    context: dg.AssetExecutionContext, pipes_subprocess_client: dg.PipesSubprocessClient
+):
+    shell_script_path = "/path/to/your/script.sh"
+    return pipes_subprocess_client.run(
+        command=["bash", shell_script_path],
+        context=context,
+    ).get_results()
+
+
+defs = dg.Definitions(
+    assets=[shell_asset],
+    resources={"pipes_subprocess_client": dg.PipesSubprocessClient()},
+)
+```
+
+### About shell
+
+A shell is a computer program that presents a command line interface, allowing you to control your computer with typed commands rather than through a graphical user interface driven by a mouse, keyboard, or touchscreen.
diff --git a/docs/docs-beta/docs/integrations/slack.md b/docs/docs-beta/docs/integrations/slack.md
new file mode 100644
index 0000000000000..1013550456b0e
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/slack.md
@@ -0,0 +1,52 @@
+---
+layout: Integration
+status: published
+name: Slack
+title: Dagster & Slack
+sidebar_label: Slack
+excerpt: Up your notification game and keep stakeholders in the loop.
+date: 2024-08-30
+apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-slack
+docslink:
+partnerlink: https://slack.com/
+logo: /integrations/Slack.svg
+categories:
+  - Alerting
+enabledBy:
+enables:
+---
+
+### About this integration
+
+This library provides an integration with Slack to support posting messages in your company's Slack workspace.
+
+### Installation
+
+```bash
+pip install dagster-slack
+```
+
+### Example
+
+```python
+# Read the docs on Resources to learn more: https://docs.dagster.io/deployment/resources
+
+import dagster as dg
+from dagster_slack import SlackResource
+
+
+@dg.asset
+def slack_message(slack: SlackResource):
+    slack.get_client().chat_postMessage(channel="#noise", text=":wave: hey there!")
+
+
+defs = dg.Definitions(
+    assets=[slack_message],
+    resources={"slack": SlackResource(token=dg.EnvVar("SLACK_TOKEN"))},
+)
+```
+
+### About Slack
+
+The **Slack** messaging app provides chat, video, and voice communication tools and is used extensively across companies and communities. The Dagster Slack community can be found at [dagster.io/slack](https://dagster.io/slack).
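+
+Beyond posting messages from assets, a common pattern is alerting a channel when a run fails. The snippet below is an illustrative sketch using the `make_slack_on_run_failure_sensor` helper from `dagster-slack`; the `#alerts` channel and `SLACK_TOKEN` environment variable are assumptions to replace with your own values.
+
+```python
+import os
+
+import dagster as dg
+from dagster_slack import make_slack_on_run_failure_sensor
+
+# Posts to #alerts whenever a run in this code location fails.
+slack_on_run_failure = make_slack_on_run_failure_sensor(
+    channel="#alerts",
+    slack_token=os.getenv("SLACK_TOKEN", ""),
+)
+
+defs = dg.Definitions(sensors=[slack_on_run_failure])
+```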
diff --git a/docs/docs-beta/docs/integrations/sling.md b/docs/docs-beta/docs/integrations/sling.md
new file mode 100644
index 0000000000000..66ea0285c7be7
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/sling.md
@@ -0,0 +1,94 @@
+---
+layout: Integration
+status: published
+name: Sling
+title: Dagster & Sling
+sidebar_label: Sling
+excerpt: Extract and load data from popular data sources to destinations with Sling through Dagster.
+date: 2024-08-30
+apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-embedded-elt
+docslink: https://docs.dagster.io/integrations/embedded-elt/sling
+partnerlink: https://slingdata.io/
+logo: /integrations/sling.png
+categories:
+  - ETL
+enabledBy:
+enables:
+---
+
+### About this integration
+
+This integration allows you to use [Sling](https://slingdata.io/) to extract and load data from popular data sources to destinations with high performance and ease.
+
+### Installation
+
+```bash
+pip install dagster-embedded-elt
+```
+
+### Example
+
+```python
+import dagster as dg
+from dagster_embedded_elt.sling import (
+    SlingConnectionResource,
+    SlingResource,
+    sling_assets,
+)
+
+source = SlingConnectionResource(
+    name="MY_PG",
+    type="postgres",
+    host="localhost",
+    port=5432,
+    database="my_database",
+    user="my_user",
+    password=dg.EnvVar("PG_PASS"),
+)
+
+target = SlingConnectionResource(
+    name="MY_SF",
+    type="snowflake",
+    host="hostname.snowflake",
+    user="username",
+    database="database",
+    password=dg.EnvVar("SF_PASSWORD"),
+    role="role",
+)
+
+
+@sling_assets(
+    replication_config={
+        "SOURCE": "MY_PG",
+        "TARGET": "MY_SF",
+        "defaults": {
+            "mode": "full-refresh",
+            "object": "{stream_schema}_{stream_table}",
+        },
+        "streams": {
+            "public.accounts": None,
+            "public.users": None,
+            "public.finance_departments": {"object": "departments"},
+        },
+    }
+)
+def my_sling_assets(context, sling: SlingResource):
+    yield from sling.replicate(context=context)
+
+
+defs = dg.Definitions(
+    assets=[my_sling_assets],
+    resources={
+        "sling": SlingResource(
+            connections=[
+                source,
+                target,
+            ]
+        )
+    },
+)
+```
+
+### About Sling
+
+Sling provides an easy-to-use YAML configuration layer for loading data from files, replicating data between databases, exporting custom SQL queries to cloud storage, and much more.
diff --git a/docs/docs-beta/docs/integrations/snowflake.md b/docs/docs-beta/docs/integrations/snowflake.md
new file mode 100644
index 0000000000000..dedddf0ee3996
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/snowflake.md
@@ -0,0 +1,60 @@
+---
+layout: Integration
+status: published
+name: Snowflake
+title: Dagster & Snowflake
+sidebar_label: Snowflake
+excerpt: An integration with the Snowflake data warehouse. Read and write natively to Snowflake from Software Defined Assets.
+date: 2022-11-07
+apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-snowflake
+docslink: https://docs.dagster.io/integrations/snowflake
+partnerlink: https://www.snowflake.com/en/
+logo: /integrations/Snowflake.svg
+categories:
+  - Storage
+enabledBy:
+enables:
+---
+
+### About this integration
+
+This library provides an integration with the Snowflake data warehouse. Connect to Snowflake as a resource, then use the integration-provided functions to construct an op to establish connections and execute Snowflake queries. Read and write natively to Snowflake from Dagster assets.
+
+### Installation
+
+```bash
+pip install dagster-snowflake
+```
+
+### Example
+
+```python
+# Read the docs on Resources to learn more: https://docs.dagster.io/deployment/resources
+# This integration also offers an I/O Manager. Learn more: https://docs.dagster.io/concepts/io-management/io-managers
+from dagster import Definitions, EnvVar, asset
+from dagster_snowflake import SnowflakeResource
+
+@asset
+def my_table(snowflake: SnowflakeResource):
+    with snowflake.get_connection() as conn:
+        return conn.cursor().execute("SELECT * FROM foo").fetchall()
+
+defs = Definitions(
+    assets=[my_table],
+    resources={
+        "snowflake": SnowflakeResource(
+            account="snowflake account",
+            user="snowflake user",
+            password=EnvVar("SNOWFLAKE_PASSWORD"),
+            database="snowflake database",
+            schema="snowflake schema",
+            warehouse="snowflake warehouse",
+        )
+    }
+)
+```
+
+### About Snowflake
+
+A cloud-based data storage and analytics service, generally termed "data-as-a-service". **Snowflake**'s data warehouse is one of the most widely adopted cloud warehouses for analytics.
diff --git a/docs/docs-beta/docs/integrations/spark.md b/docs/docs-beta/docs/integrations/spark.md
new file mode 100644
index 0000000000000..5b0546e6043c3
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/spark.md
@@ -0,0 +1,28 @@
+---
+layout: Integration
+status: published
+name: Spark
+title: Dagster & Spark
+sidebar_label: Spark
+excerpt: Configure and run Spark jobs.
+date: 2022-11-07
+apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-spark
+docslink: https://docs.dagster.io/integrations/spark
+partnerlink:
+logo: /integrations/Spark.svg
+categories:
+  - Compute
+enabledBy:
+  - dagster-pyspark
+enables:
+---
+
+### About this integration
+
+Spark jobs typically execute on infrastructure that's specialized for Spark. Spark applications are typically not containerized or executed on Kubernetes.
+
+Running Spark code often requires submitting code to a Databricks or EMR cluster. `dagster-pyspark` provides a Spark class with methods for configuration and constructing the `spark-submit` command for a Spark job.
+
+### About Apache Spark
+
+**Apache Spark** is an open source unified analytics engine for large-scale data processing. Spark provides an interface for programming clusters with implicit data parallelism and fault tolerance. It also provides libraries for graph computation, SQL for structured data processing, ML, and data science.
diff --git a/docs/docs-beta/docs/integrations/ssh-sftp.md b/docs/docs-beta/docs/integrations/ssh-sftp.md
new file mode 100644
index 0000000000000..7e311dc1c7b33
--- /dev/null
+++ b/docs/docs-beta/docs/integrations/ssh-sftp.md
@@ -0,0 +1,50 @@
+---
+layout: Integration
+status: published
+name: SSH/SFTP
+title: Dagster & SSH/SFTP
+sidebar_label: SSH/SFTP
+excerpt: Establish encrypted connections to networked resources.
+date: 2022-11-07
+apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-ssh
+docslink:
+partnerlink: https://www.ssh.com/academy/ssh/protocol
+logo: /integrations/SSH.svg
+categories:
+  - Other
+enabledBy:
+enables:
+---
+
+### About this integration
+
+This integration provides a resource for SSH remote execution using [Paramiko](https://github.com/paramiko/paramiko). It allows you to establish secure connections to networked resources and execute commands remotely. The integration also provides an SFTP client for secure file transfers between the local and remote systems.
+ +### Installation + +```bash +pip install dagster-ssh +``` + +### Example + +```python +import dagster as dg +from dagster_ssh import SSHResource + + +@dg.asset +def ssh_asset(ssh: SSHResource): + ssh.sftp_get("/path/to/remote.csv", "path/to/local.csv") + + +defs = dg.Definitions( + assets=[ssh_asset], + resources={"ssh": SSHResource(remote_host="foo.com", key_file="path/to/id_rsa")}, +) + +``` + +### About SSH SFTP + +The **SSH protocol** allows for secure remote login with strong authentication to networked resources. It protects network connections with strong encryption. The Dagster library provides direct SSH and SFTP calls from within the execution of your pipelines. diff --git a/docs/docs-beta/docs/integrations/twilio.md b/docs/docs-beta/docs/integrations/twilio.md new file mode 100644 index 0000000000000..d3ef82f7ad00f --- /dev/null +++ b/docs/docs-beta/docs/integrations/twilio.md @@ -0,0 +1,57 @@ +--- +layout: Integration +status: published +name: Twilio +title: Dagster & Twilio +sidebar_label: Twilio +excerpt: Integrate Twilio tasks into your data pipeline runs. +date: 2024-08-30 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-twilio +docslink: +partnerlink: https://www.twilio.com/ +logo: /integrations/Twilio.svg +categories: + - Alerting +enabledBy: +enables: +--- + +### About this integration + +Use your Twilio `Account SID` and `Auth Token` to build Twilio tasks right into your Dagster pipeline. + +### Installation + +```bash +pip install dagster-twilio +``` + +### Example + +```python +# Read the docs on Resources to learn more: https://docs.dagster.io/deployment/resources +import dagster as dg +from dagster_twilio import TwilioResource + + +@dg.asset +def twilio_message(twilio: TwilioResource): + twilio.get_client().messages.create( + to="+15551234567", from_="+15558901234", body="Hello world!" + ) + + +defs = dg.Definitions( + assets=[twilio_message], + resources={ + "twilio": TwilioResource( + account_sid=dg.EnvVar("TWILIO_ACCOUNT_SID"), + auth_token=dg.EnvVar("TWILIO_AUTH_TOKEN"), + ) + }, +) +``` + +### About Twilio + +**Twilio** provides communication APIs for phone calls, text messages, and other communication functions. diff --git a/docs/docs-beta/docs/integrations/wandb.md b/docs/docs-beta/docs/integrations/wandb.md new file mode 100644 index 0000000000000..57a4e0aaad2af --- /dev/null +++ b/docs/docs-beta/docs/integrations/wandb.md @@ -0,0 +1,46 @@ +--- +layout: Integration +status: published +name: Weights & Biases +title: Dagster & Weights & Biases +sidebar_label: Weights & Biases +excerpt: Orchestrate your MLOps pipelines and maintain ML assets. +date: 2023-02-07 +apireflink: https://docs.dagster.io/_apidocs/libraries/dagster-wandb +docslink: https://docs.wandb.ai/guides/integrations/dagster +partnerlink: https://wandb.ai/ +communityIntegration: True +logo: /integrations/WandB.svg +categories: + - Other +enabledBy: +enables: +--- + +### About this integration + +Use Dagster and Weights & Biases (W&B) to orchestrate your MLOps pipelines and maintain ML assets. The integration with W&B makes it easy within Dagster to: + +- use and create W&B Artifacts +- use and create Registered Models in W&B Model Registry +- run training jobs on dedicated compute using W&B Launch +- use the Weights & Biases client in ops and assets + +The W&B Dagster integration provides a W&B-specific Dagster resource and I/O Manager: + +- `wandb_resource`: a Dagster resource used to authenticate and communicate to the W&B API. 
+- `wandb_artifacts_io_manager`: a Dagster I/O Manager used to consume W&B Artifacts. + +### Installation + +To use this integration you will need a Weights and Biases account. Then you will need an W&B API Key, a W&B entity (user or team), and a W&B project. Full installation details can be found on [the Weights and Biases website here](https://docs.wandb.ai/guides/integrations/other/dagster). + +**Note** that Weights & Biases do offer a free cloud account for personal (non-corporate) use. Check out their [pricing page](https://wandb.ai/site/pricing) for details. + +### Example + +A complete tutorial can be found on [the Weights and Biases website here](https://docs.wandb.ai/guides/integrations/other/dagster). + +### About Weights & Biases + +[Weights & Biases](https://wandb.ai/site) makes it easy to track your experiments, manage & version your data, and collaborate with your team so you can focus on building the best machine learning models. diff --git a/docs/docs-beta/sidebars.ts b/docs/docs-beta/sidebars.ts index ef80bf5047789..9eea761f7c800 100644 --- a/docs/docs-beta/sidebars.ts +++ b/docs/docs-beta/sidebars.ts @@ -1,4 +1,4 @@ -import type { SidebarsConfig } from '@docusaurus/plugin-content-docs'; +import type {SidebarsConfig} from '@docusaurus/plugin-content-docs'; const sidebars: SidebarsConfig = { docs: [ { @@ -30,7 +30,8 @@ const sidebars: SidebarsConfig = { { type: 'category', label: 'Configure', - items: ['guides/configuring-assets', + items: [ + 'guides/configuring-assets', 'guides/asset-factories', 'guides/resources', 'guides/io-managers', @@ -39,11 +40,7 @@ const sidebars: SidebarsConfig = { { type: 'category', label: 'Integrate', - items: [ - 'guides/ingesting-data', - 'guides/transform-dbt', - 'guides/non-python', - ], + items: ['guides/ingesting-data', 'guides/transform-dbt', 'guides/non-python'], }, ], }, @@ -164,6 +161,12 @@ const sidebars: SidebarsConfig = { ], }, ], + integrations: [ + { + type: 'autogenerated', + dirName: 'integrations', + }, + ], dagsterPlus: [ { type: 'category', diff --git a/docs/vale/styles/config/vocabularies/Dagster/accept.txt b/docs/vale/styles/config/vocabularies/Dagster/accept.txt index 3d480e40f1561..9ecd485dc14f0 100644 --- a/docs/vale/styles/config/vocabularies/Dagster/accept.txt +++ b/docs/vale/styles/config/vocabularies/Dagster/accept.txt @@ -1,22 +1,36 @@ +ACID ACR ACS AD +ADLS2 AKS AWS -AWS +AWS Athena +AWS CloudWatch +AWS Glue +AWS Lambda +AWS Secrets Manager +AWS Systems Parameter Store +AWS Redshift Airflow Airbyte +Apache +Azure Data Lake Storage Gen 2 BigQuery chatbot +Census CI/CD CLI[s] Crontab +Cube DAG DSL DataFrame Databricks Datadog +Dataproc Declarative Automation +Delta Lake Docker Dockerfile DuckDB @@ -24,42 +38,67 @@ ECR ECS ECS EKS +EMR ELT Fivetran +Flink GCR GDPR GKE GitHub +Google Cloud Platform +GCS +HashiCorp +HashiCorp Vault +Hightouch HIPAA HPC Helm +Hudi IAM IAM JavaScript +Jupyter +Meltano +Lakehouse +Looker MLflow +MLOps Microsoft Teams MongoDB MySQL Okta +OLAP +OLTP OneLogin OpenAI +Open Metadata +Pandas PEX PagerDuty Pandera PingOne Postgres +Prometheus Pydantic RBAC RDS REST API SCIM +SDF +Secoda +SFTP SHA SLA SLAs SOC +Spark +SSH Snowflake Tensorflow +Trino Twilio +Weights & Biases [Cc]onfig [Mm]aterializable [Mm]aterializations @@ -70,17 +109,22 @@ Twilio [Ss]ubprocess \bDagster\b auditable +composable dagster-.* dbt +dbt Cloud dlt enqueue[ds] frontmatter gRPC +hardcode +lakeFS lookback namespace performant plaintext pluggable +productionize stderr stdout substep