diff --git a/docs/content/api/modules.json.gz b/docs/content/api/modules.json.gz index cf6c2f8907aa1..0d3896cbd89d8 100644 Binary files a/docs/content/api/modules.json.gz and b/docs/content/api/modules.json.gz differ diff --git a/docs/content/api/searchindex.json.gz b/docs/content/api/searchindex.json.gz index 5aa13c7d4a733..baa18eca0502b 100644 Binary files a/docs/content/api/searchindex.json.gz and b/docs/content/api/searchindex.json.gz differ diff --git a/docs/content/api/sections.json.gz b/docs/content/api/sections.json.gz index 6de86d3fc15d9..ca4f3fd3e093d 100644 Binary files a/docs/content/api/sections.json.gz and b/docs/content/api/sections.json.gz differ diff --git a/docs/next/public/objects.inv b/docs/next/public/objects.inv index d0ce4de5c8d98..e0b37f6366531 100644 Binary files a/docs/next/public/objects.inv and b/docs/next/public/objects.inv differ diff --git a/examples/with_great_expectations/setup.py b/examples/with_great_expectations/setup.py index 2d70c4dd0aca1..ef5b56b52572e 100644 --- a/examples/with_great_expectations/setup.py +++ b/examples/with_great_expectations/setup.py @@ -6,7 +6,6 @@ install_requires=[ "dagster", "dagster-ge", - "great_expectations>=0.14.12", # pinned because pip is using the cached wheel for 0.13.14 ], extras_require={"dev": ["dagster-webserver", "pytest"]}, ) diff --git a/examples/with_great_expectations/with_great_expectations/ge_demo.py b/examples/with_great_expectations/with_great_expectations/ge_demo.py index a7cbb4d656932..a66c0ea7eac29 100644 --- a/examples/with_great_expectations/with_great_expectations/ge_demo.py +++ b/examples/with_great_expectations/with_great_expectations/ge_demo.py @@ -32,7 +32,12 @@ def postprocess_payroll(numrows, expectation): # start_ge_demo_marker_factory payroll_expectations = ge_validation_op_factory( - name="ge_validation_op", datasource_name="getest", suite_name="basic.warning" + name="ge_validation_op", + datasource_name="getest", + data_connector_name="my_runtime_data_connector", + data_asset_name="test_asset", + suite_name="basic.warning", + batch_identifiers={"foo": "bar"}, ) # end_ge_demo_marker_factory diff --git a/examples/with_great_expectations/with_great_expectations/great_expectations/expectations/.ge_store_backend_id b/examples/with_great_expectations/with_great_expectations/great_expectations/expectations/.ge_store_backend_id deleted file mode 100644 index d52739cbcff45..0000000000000 --- a/examples/with_great_expectations/with_great_expectations/great_expectations/expectations/.ge_store_backend_id +++ /dev/null @@ -1 +0,0 @@ -store_backend_id = 9656cbae-5408-4731-a54c-439358114e8f diff --git a/examples/with_great_expectations/with_great_expectations/great_expectations/great_expectations.yml b/examples/with_great_expectations/with_great_expectations/great_expectations/great_expectations.yml deleted file mode 100644 index 2137a263b6b89..0000000000000 --- a/examples/with_great_expectations/with_great_expectations/great_expectations/great_expectations.yml +++ /dev/null @@ -1,117 +0,0 @@ -# Welcome to Great Expectations! Always know what to expect from your data. -# -# Here you can define datasources, batch kwargs generators, integrations and -# more. This file is intended to be committed to your repo. 
For help with -# configuration please: -# - Read our docs: https://docs.greatexpectations.io/en/latest/reference/data_context_reference.html#configuration -# - Join our slack channel: http://greatexpectations.io/slack - -config_version: 2.0 - -# Datasources tell Great Expectations where your data lives and how to get it. -# You can use the CLI command `great_expectations datasource new` to help you -# add a new datasource. Read more at https://docs.greatexpectations.io/en/latest/features/datasource.html -datasources: - getest: - module_name: great_expectations.datasource - data_asset_type: - module_name: great_expectations.dataset - class_name: PandasDataset - batch_kwargs_generators: - subdir_reader: - class_name: SubdirReaderBatchKwargsGenerator - base_directory: ..\. - class_name: PandasDatasource - getestspark: - module_name: great_expectations.datasource - data_asset_type: - module_name: great_expectations.dataset - class_name: SparkDFDataset - batch_kwargs_generators: - subdir_reader: - class_name: SubdirReaderBatchKwargsGenerator - base_directory: ..\. - class_name: SparkDFDatasource -config_variables_file_path: uncommitted/config_variables.yml - -# The plugins_directory will be added to your python path for custom modules -# used to override and extend Great Expectations. -plugins_directory: plugins/ - -# Validation Operators are customizable workflows that bundle the validation of -# one or more expectation suites and subsequent actions. The example below -# stores validations and send a slack notification. To read more about -# customizing and extending these, read: https://docs.greatexpectations.io/en/latest/features/validation_operators_and_actions.html -validation_operators: - action_list_operator: - # To learn how to configure sending Slack notifications during evaluation - # (and other customizations), read: https://docs.greatexpectations.io/en/latest/reference/validation_operators/action_list_validation_operator.html - class_name: ActionListValidationOperator - action_list: - - name: store_validation_result - action: - class_name: StoreValidationResultAction - - name: store_evaluation_params - action: - class_name: StoreEvaluationParametersAction - - name: update_data_docs - action: - class_name: UpdateDataDocsAction - # - name: send_slack_notification_on_validation_result - # action: - # class_name: SlackNotificationAction - # # put the actual webhook URL in the uncommitted/config_variables.yml file - # slack_webhook: ${validation_notification_slack_webhook} - # notify_on: all # possible values: "all", "failure", "success" - # renderer: - # module_name: great_expectations.render.renderer.slack_renderer - # class_name: SlackRenderer - -stores: -# Stores are configurable places to store things like Expectations, Validations -# Data Docs, and more. These are for advanced users only - most users can simply -# leave this section alone. -# -# Three stores are required: expectations, validations, and -# evaluation_parameters, and must exist with a valid store entry. Additional -# stores can be configured for uses such as data_docs, validation_operators, etc. - expectations_store: - class_name: ExpectationsStore - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: expectations/ - - validations_store: - class_name: ValidationsStore - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: uncommitted/validations/ - - evaluation_parameter_store: - # Evaluation Parameters enable dynamic expectations. 
Read more here: - # https://docs.greatexpectations.io/en/latest/reference/evaluation_parameters.html - class_name: EvaluationParameterStore - -expectations_store_name: expectations_store -validations_store_name: validations_store -evaluation_parameter_store_name: evaluation_parameter_store - -data_docs_sites: - # Data Docs make it simple to visualize data quality in your project. These - # include Expectations, Validations & Profiles. The are built for all - # Datasources from JSON artifacts in the local repo including validations & - # profiles from the uncommitted directory. Read more at https://docs.greatexpectations.io/en/latest/features/data_docs.html - local_site: - class_name: SiteBuilder - # set to false to hide how-to buttons in Data Docs - show_how_to_buttons: true - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: uncommitted/data_docs/local_site/ - site_index_builder: - class_name: DefaultSiteIndexBuilder - -anonymous_usage_statistics: - data_context_id: 9656cbae-5408-4731-a54c-439358114e8f - enabled: true -notebooks: diff --git a/examples/with_great_expectations/with_great_expectations/great_expectations/notebooks/pandas/validation_playground.ipynb b/examples/with_great_expectations/with_great_expectations/great_expectations/notebooks/pandas/validation_playground.ipynb deleted file mode 100644 index 61c5c58a18a1e..0000000000000 --- a/examples/with_great_expectations/with_great_expectations/great_expectations/notebooks/pandas/validation_playground.ipynb +++ /dev/null @@ -1,243 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Validation Playground\n", - "\n", - "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", - "\n", - "#### This notebook assumes that you created at least one expectation suite in your project.\n", - "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", - "\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from great_expectations.datasource.types import BatchKwargs\n", - "import datetime" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. 
Choose an Expectation Suite\n", - "\n", - "List expectation suites that you created in your project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.list_expectation_suite_names()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = None # TODO: set to a name from the list above" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list datasources of the type PandasDatasource in your project\n", - "[\n", - " datasource[\"name\"]\n", - " for datasource in context.list_datasources()\n", - " if datasource[\"class_name\"] == \"PandasDatasource\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datasource_name = None # TODO: set to a datasource name from above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you would like to validate a file on a filesystem:\n", - "batch_kwargs = {\"path\": \"YOUR_FILE_PATH\", \"datasource\": datasource_name}\n", - "\n", - "# If you already loaded the data into a Pandas Data Frame:\n", - "batch_kwargs = {\"dataset\": \"YOUR_DATAFRAME\", \"datasource\": datasource_name}\n", - "\n", - "\n", - "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Validate the batch with Validation Operators\n", - "\n", - "`Validation Operators` provide a convenient way to bundle the validation of\n", - "multiple expectation suites and the actions that should be taken after validation.\n", - "\n", - "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", - "\n", - "* validating a group of batches that are logically related\n", - "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", - "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", - "\n", - "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", - "\n", - "\"\"\"\n", - "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. View the Validation Results in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", - "\n", - "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You ran Validations!\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Read about the typical workflow with Great Expectations:\n", - "\n", - "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/with_great_expectations/with_great_expectations/great_expectations/notebooks/spark/validation_playground.ipynb b/examples/with_great_expectations/with_great_expectations/great_expectations/notebooks/spark/validation_playground.ipynb deleted file mode 100644 index 555c4dd32b252..0000000000000 --- a/examples/with_great_expectations/with_great_expectations/great_expectations/notebooks/spark/validation_playground.ipynb +++ /dev/null @@ -1,244 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Validation Playground\n", - "\n", - "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", - "\n", - "#### This notebook assumes that you created at least one expectation suite in your project.\n", - "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", - "\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from great_expectations.datasource.types import BatchKwargs\n", - "import datetime" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Choose an Expectation Suite\n", - "\n", - "List expectation suites that you created in your project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.list_expectation_suite_names()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = None # TODO: set to a name from the list above" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list datasources of the type SparkDFDatasource in your project\n", - "[\n", - " datasource[\"name\"]\n", - " for datasource in context.list_datasources()\n", - " if datasource[\"class_name\"] == \"SparkDFDatasource\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datasource_name = None # TODO: set to a datasource name from above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you would like to validate a file on a filesystem:\n", - "batch_kwargs = {\"path\": \"YOUR_FILE_PATH\", \"datasource\": datasource_name}\n", - "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true')\n", - "\n", - "# If you already loaded the data into a PySpark Data Frame:\n", - "batch_kwargs = {\"dataset\": \"YOUR_DATAFRAME\", \"datasource\": datasource_name}\n", - "\n", - "\n", - "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Validate the batch with Validation Operators\n", - "\n", - "`Validation Operators` provide a convenient way to bundle the validation of\n", - "multiple expectation suites and the actions that should be taken after validation.\n", - "\n", - "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", - "\n", - "* validating a group of batches that are logically related\n", - "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", - "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", - "\n", - "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", - "\n", - "\"\"\"\n", - "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. View the Validation Results in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", - "\n", - "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You ran Validations!\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Read about the typical workflow with Great Expectations:\n", - "\n", - "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/with_great_expectations/with_great_expectations/great_expectations/notebooks/sql/validation_playground.ipynb b/examples/with_great_expectations/with_great_expectations/great_expectations/notebooks/sql/validation_playground.ipynb deleted file mode 100644 index 17e21e1308c4f..0000000000000 --- a/examples/with_great_expectations/with_great_expectations/great_expectations/notebooks/sql/validation_playground.ipynb +++ /dev/null @@ -1,246 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Validation Playground\n", - "\n", - "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", - "\n", - "#### This notebook assumes that you created at least one expectation suite in your project.\n", - "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", - "\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from great_expectations.datasource.types import BatchKwargs\n", - "import datetime" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Choose an Expectation Suite\n", - "\n", - "List expectation suites that you created in your project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.list_expectation_suite_names()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = None # TODO: set to a name from the list above" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list datasources of the type SqlAlchemyDatasource in your project\n", - "[\n", - " datasource[\"name\"]\n", - " for datasource in context.list_datasources()\n", - " if datasource[\"class_name\"] == \"SqlAlchemyDatasource\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datasource_name = None # TODO: set to a datasource name from above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you would like to validate an entire table or view in your database's default schema:\n", - "batch_kwargs = {\"table\": \"YOUR_TABLE\", \"datasource\": datasource_name}\n", - "\n", - "# If you would like to validate an entire table or view from a non-default schema in your database:\n", - "batch_kwargs = {\"table\": \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", \"datasource\": datasource_name}\n", - "\n", - "# If you would like to validate the result set of a query:\n", - "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", - "\n", - "\n", - "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Validate the batch with Validation Operators\n", - "\n", - "`Validation Operators` provide a convenient way to bundle the validation of\n", - "multiple expectation suites and the actions that should be taken after validation.\n", - "\n", - "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", - "\n", - "* validating a group of batches that are logically related\n", - "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", - "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", - "\n", - "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", - "\n", - "\"\"\"\n", - "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. View the Validation Results in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", - "\n", - "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You ran Validations!\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Read about the typical workflow with Great Expectations:\n", - "\n", - "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/with_great_expectations/with_great_expectations/great_expectations/.gitignore b/examples/with_great_expectations/with_great_expectations/gx/.gitignore similarity index 100% rename from examples/with_great_expectations/with_great_expectations/great_expectations/.gitignore rename to examples/with_great_expectations/with_great_expectations/gx/.gitignore diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations_v3/expectations/.ge_store_backend_id b/examples/with_great_expectations/with_great_expectations/gx/expectations/.ge_store_backend_id similarity index 100% rename from python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations_v3/expectations/.ge_store_backend_id rename to examples/with_great_expectations/with_great_expectations/gx/expectations/.ge_store_backend_id diff --git a/examples/with_great_expectations/with_great_expectations/great_expectations/expectations/basic/warning.json b/examples/with_great_expectations/with_great_expectations/gx/expectations/basic/warning.json similarity index 100% rename from examples/with_great_expectations/with_great_expectations/great_expectations/expectations/basic/warning.json rename to examples/with_great_expectations/with_great_expectations/gx/expectations/basic/warning.json diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations_v3/great_expectations.yml b/examples/with_great_expectations/with_great_expectations/gx/great_expectations.yml similarity index 100% rename from python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations_v3/great_expectations.yml rename to examples/with_great_expectations/with_great_expectations/gx/great_expectations.yml diff --git a/examples/with_great_expectations/with_great_expectations/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css b/examples/with_great_expectations/with_great_expectations/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css similarity index 100% rename from examples/with_great_expectations/with_great_expectations/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css rename to examples/with_great_expectations/with_great_expectations/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css diff --git a/examples/with_great_expectations/with_great_expectations_tests/test_ge_example.py b/examples/with_great_expectations/with_great_expectations_tests/test_ge_example.py index 82b35b635de30..2172163b65aff 100644 --- a/examples/with_great_expectations/with_great_expectations_tests/test_ge_example.py +++ b/examples/with_great_expectations/with_great_expectations_tests/test_ge_example.py @@ -1,3 +1,5 @@ +import os + import pytest from dagster import RunConfig from dagster._utils import file_relative_path @@ -6,14 +8,14 @@ from with_great_expectations.definitions import defs from 
with_great_expectations.ge_demo import GEOpConfig, payroll_data +_GE_ROOT_DIR = file_relative_path(os.path.dirname(__file__), "with_great_expectations/gx") + def test_pipeline_success(): res = payroll_data.execute_in_process( resources={ "ge_data_context": GEContextResource( - ge_root_dir=file_relative_path( - __file__, "../with_great_expectations/great_expectations" - ) + ge_root_dir=_GE_ROOT_DIR, ) }, ) @@ -25,9 +27,7 @@ def test_pipeline_failure(): payroll_data.execute_in_process( resources={ "ge_data_context": GEContextResource( - ge_root_dir=file_relative_path( - __file__, "../with_great_expectations/great_expectations" - ) + ge_root_dir=_GE_ROOT_DIR, ) }, run_config=RunConfig( diff --git a/pyright/alt-1/requirements-pinned.txt b/pyright/alt-1/requirements-pinned.txt index 3d741b58e20d3..e826f1ad36671 100644 --- a/pyright/alt-1/requirements-pinned.txt +++ b/pyright/alt-1/requirements-pinned.txt @@ -24,9 +24,9 @@ backports-tarfile==1.2.0 beautifulsoup4==4.12.3 bleach==6.1.0 boto3==1.35.36 -boto3-stubs-lite==1.35.46 +boto3-stubs-lite==1.35.48 botocore==1.35.36 -botocore-stubs==1.35.46 +botocore-stubs==1.35.48 buildkite-test-collector==0.1.9 cachetools==5.5.0 caio==0.9.17 @@ -40,7 +40,7 @@ coloredlogs==14.0 comm==0.2.2 contourpy==1.3.0 coverage==7.6.4 -croniter==3.0.3 +croniter==3.0.4 cryptography==43.0.3 cycler==0.12.1 daff==1.3.46 @@ -67,7 +67,7 @@ daff==1.3.46 db-dtypes==1.3.0 dbt-adapters==1.3.2 dbt-common==1.3.0 -dbt-core==1.8.7 +dbt-core==1.8.8 dbt-duckdb==1.9.0 dbt-extractor==0.5.1 dbt-semantic-interfaces==0.5.1 @@ -117,12 +117,12 @@ httplib2==0.22.0 httptools==0.6.4 httpx==0.27.2 humanfriendly==10.0 -hypothesis==6.115.3 +hypothesis==6.115.5 idna==3.10 importlib-metadata==6.11.0 iniconfig==2.0.0 ipykernel==6.29.5 -ipython==8.28.0 +ipython==8.29.0 isodate==0.6.1 isoduration==20.11.0 isort==5.13.2 @@ -154,7 +154,7 @@ makefun==1.15.6 mako==1.3.6 markdown-it-py==3.0.0 markupsafe==3.0.2 -mashumaro==3.13.1 +mashumaro==3.14 matplotlib==3.9.2 matplotlib-inline==0.1.7 mccabe==0.7.0 @@ -168,7 +168,7 @@ msgpack==1.1.0 multidict==6.1.0 multimethod==1.10 mypy==1.13.0 -mypy-boto3-ecs==1.35.43 +mypy-boto3-ecs==1.35.48 mypy-boto3-emr==1.35.39 mypy-boto3-emr-serverless==1.35.25 mypy-boto3-glue==1.35.25 @@ -203,7 +203,7 @@ pillow==11.0.0 pip==24.2 platformdirs==4.3.6 pluggy==1.5.0 -polars==1.10.0 +polars==1.11.0 -e examples/project_fully_featured prometheus-client==0.21.0 prompt-toolkit==3.0.48 @@ -244,7 +244,7 @@ pytimeparse==1.1.8 pytz==2024.2 pyyaml==6.0.2 pyzmq==26.2.0 -rapidfuzz==3.10.0 +rapidfuzz==3.10.1 referencing==0.35.1 requests==2.32.3 requests-oauthlib==2.0.0 @@ -264,9 +264,9 @@ send2trash==1.8.3 setuptools==75.2.0 shellingham==1.5.4 six==1.16.0 -slack-sdk==3.33.1 +slack-sdk==3.33.2 sniffio==1.3.1 -snowflake-connector-python==3.12.2 +snowflake-connector-python==3.12.3 snowflake-sqlalchemy==1.5.1 sortedcontainers==2.4.0 soupsieve==2.6 @@ -282,7 +282,7 @@ tabulate==0.9.0 terminado==0.18.1 text-unidecode==1.3 threadpoolctl==3.5.0 -tinycss2==1.3.0 +tinycss2==1.4.0 tomli==2.0.2 tomlkit==0.13.2 toposort==1.10 @@ -292,7 +292,7 @@ tqdm==4.66.5 traitlets==5.14.3 typeguard==4.3.0 typer==0.12.5 -types-awscrt==0.22.4 +types-awscrt==0.23.0 types-backports==0.1.3 types-certifi==2021.10.8.3 types-cffi==1.16.0.20240331 @@ -308,7 +308,7 @@ types-pytz==2024.2.0.20241003 types-pyyaml==6.0.12.20240917 types-requests==2.32.0.20241016 types-s3transfer==0.10.3 -types-setuptools==75.2.0.20241019 +types-setuptools==75.2.0.20241025 types-simplejson==3.19.0.20240801 types-six==1.16.21.20241009 
types-sqlalchemy==1.4.53.34 diff --git a/pyright/master/requirements-pinned.txt b/pyright/master/requirements-pinned.txt index 439b296f79a93..3b3de2862872b 100644 --- a/pyright/master/requirements-pinned.txt +++ b/pyright/master/requirements-pinned.txt @@ -17,7 +17,7 @@ apache-airflow==2.7.3 apache-airflow-providers-apache-spark==4.9.0 apache-airflow-providers-cncf-kubernetes==8.3.4 apache-airflow-providers-common-sql==1.15.0 -apache-airflow-providers-docker==3.9.1 +apache-airflow-providers-docker==3.12.3 apache-airflow-providers-ftp==3.10.1 apache-airflow-providers-http==4.1.0 apache-airflow-providers-imap==3.6.1 @@ -38,7 +38,7 @@ asttokens==2.4.1 astunparse==1.6.3 async-lru==2.0.4 attrs==24.2.0 -autodocsumm==0.2.13 +autodocsumm==0.2.14 autoflake==2.3.1 -e python_modules/automation avro==1.11.3 @@ -57,13 +57,13 @@ billiard==4.2.1 bleach==6.1.0 blinker==1.8.2 bokeh==3.6.0 -boto3==1.35.46 -boto3-stubs-lite==1.35.46 -botocore==1.35.46 -botocore-stubs==1.35.46 +boto3==1.35.48 +boto3-stubs-lite==1.35.48 +botocore==1.35.48 +botocore-stubs==1.35.48 buildkite-test-collector==0.1.9 cachecontrol==0.14.0 -cached-property==1.5.2 +cached-property==2.0 cachelib==0.9.0 cachetools==5.5.0 caio==0.9.17 @@ -93,7 +93,7 @@ connexion==2.14.2 contourpy==1.3.0 coverage==7.6.4 cron-descriptor==1.4.5 -croniter==3.0.3 +croniter==3.0.4 cryptography==43.0.3 cssutils==2.11.1 cycler==0.12.1 @@ -180,7 +180,7 @@ dataproperty==1.0.1 db-dtypes==1.3.0 dbt-adapters==1.3.2 dbt-common==1.3.0 -dbt-core==1.8.7 +dbt-core==1.8.8 dbt-duckdb==1.9.0 -e examples/experimental/dagster-airlift/examples/dbt-example dbt-extractor==0.5.1 @@ -200,7 +200,7 @@ distributed==2024.10.0 distro==1.9.0 dlt==1.3.0 dnspython==2.7.0 -docker==5.0.3 +docker==7.1.0 docker-image-py==0.1.13 docker-pycreds==0.4.0 -e examples/docs_snippets @@ -229,7 +229,7 @@ flask-limiter==3.8.0 flask-login==0.6.3 flask-session==0.5.0 flask-sqlalchemy==2.5.1 -flask-wtf==1.2.1 +flask-wtf==1.2.2 flatbuffers==24.3.25 fonttools==4.54.1 fqdn==1.5.1 @@ -255,7 +255,7 @@ graphene==3.4 graphql-core==3.2.5 graphql-relay==3.2.0 graphviz==0.20.3 -great-expectations==0.17.11 +great-expectations==0.18.21 grpcio==1.67.0 grpcio-health-checking==1.62.3 grpcio-status==1.62.3 @@ -270,7 +270,7 @@ httptools==0.6.4 httpx==0.27.2 humanfriendly==10.0 humanize==4.11.0 -hypothesis==6.115.3 +hypothesis==6.115.5 idna==3.10 ijson==3.3.0 imagesize==1.4.1 @@ -279,7 +279,7 @@ importlib-resources==6.4.5 inflection==0.5.1 iniconfig==2.0.0 ipykernel==6.29.5 -ipython==8.28.0 +ipython==8.29.0 ipython-genutils==0.2.0 ipywidgets==8.1.5 iso8601==2.1.0 @@ -339,7 +339,7 @@ markupsafe==3.0.2 marshmallow==3.23.0 marshmallow-oneofschema==3.1.1 marshmallow-sqlalchemy==0.26.1 -mashumaro==3.13.1 +mashumaro==3.14 matplotlib==3.9.2 matplotlib-inline==0.1.3 mbstrdecoder==1.1.3 @@ -359,7 +359,7 @@ msal-extensions==1.2.0 msgpack==1.1.0 multidict==6.1.0 multimethod==1.10 -mypy-boto3-ecs==1.35.43 +mypy-boto3-ecs==1.35.48 mypy-boto3-emr==1.35.39 mypy-boto3-emr-serverless==1.35.25 mypy-boto3-glue==1.35.25 @@ -385,7 +385,7 @@ objgraph==3.6.2 onnx==1.17.0 onnxconverter-common==1.13.0 onnxruntime==1.19.2 -openai==1.52.1 +openai==1.52.2 openapi-schema-validator==0.6.2 openapi-spec-validator==0.7.1 opentelemetry-api==1.27.0 @@ -426,7 +426,7 @@ platformdirs==4.3.6 plotly==5.24.1 pluggy==1.5.0 ply==3.11 -polars==1.10.0 +polars==1.11.0 portalocker==2.10.1 prison==0.2.1 progressbar2==4.5.0 @@ -469,7 +469,7 @@ pytest-cov==5.0.0 pytest-mock==3.14.0 pytest-rerunfailures==14.0 pytest-xdist==3.6.1 -python-daemon==3.0.1 +python-daemon==3.1.0 
python-dateutil==2.9.0.post0 python-dotenv==1.0.1 python-frontmatter==1.1.0 @@ -486,7 +486,7 @@ pytzdata==2020.1 pyyaml==6.0.2 pyzmq==26.2.0 querystring-parser==1.2.4 -rapidfuzz==3.10.0 +rapidfuzz==3.10.1 readme-renderer==44.0 referencing==0.35.1 regex==2024.9.11 @@ -503,8 +503,9 @@ rich==13.9.3 rich-argparse==1.5.2 rpds-py==0.20.0 rsa==4.9 -ruamel-yaml==0.17.17 -ruff==0.7.0 +ruamel-yaml==0.17.40 +ruamel-yaml-clib==0.2.12 +ruff==0.7.1 s3transfer==0.10.3 scikit-learn==1.5.2 scipy==1.14.1 @@ -522,13 +523,13 @@ simplejson==3.19.3 six==1.16.0 skein==0.8.2 skl2onnx==1.17.0 -slack-sdk==3.33.1 -sling==1.2.21 -sling-mac-arm64==1.2.21 +slack-sdk==3.33.2 +sling==1.2.22 +sling-mac-arm64==1.2.22 smmap==5.0.1 sniffio==1.3.1 snowballstemmer==2.2.0 -snowflake-connector-python==3.12.2 +snowflake-connector-python==3.12.3 snowflake-sqlalchemy==1.6.1 sortedcontainers==2.4.0 soupsieve==2.6 @@ -567,7 +568,7 @@ terminado==0.18.1 text-unidecode==1.3 threadpoolctl==3.5.0 tiktoken==0.8.0 -tinycss2==1.3.0 +tinycss2==1.4.0 toml==0.10.2 tomli==2.0.2 tomlkit==0.13.2 @@ -583,12 +584,12 @@ trio==0.27.0 trio-websocket==0.11.1 -e examples/experimental/dagster-airlift/examples/tutorial-example -e examples/tutorial_notebook_assets -twilio==9.3.4 +twilio==9.3.6 twine==1.15.0 typeguard==4.3.0 typepy==1.3.2 typer==0.12.5 -types-awscrt==0.22.4 +types-awscrt==0.23.0 types-backports==0.1.3 types-certifi==2021.10.8.3 types-cffi==1.16.0.20240331 @@ -604,7 +605,7 @@ types-pytz==2024.2.0.20241003 types-pyyaml==6.0.12.20240917 types-requests==2.32.0.20241016 types-s3transfer==0.10.3 -types-setuptools==75.2.0.20241019 +types-setuptools==75.2.0.20241025 types-simplejson==3.19.0.20240801 types-six==1.16.21.20241009 types-sqlalchemy==1.4.53.34 diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/fail.csv b/python_modules/libraries/dagster-ge/dagster_ge/examples/fail.csv deleted file mode 100644 index 1085b62977933..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge/examples/fail.csv +++ /dev/null @@ -1,32 +0,0 @@ -"Team", "Payroll (millions)", "Wins" -"Nationals", 81000000000.34, 980000000 -"Reds", 82.20, -97 -"Yankees", 197.96, 95 -"Giants", 117.62, 940 -"Braves", 83.31, 94 -"Athletics", 55.37, 94 -"Rangers", 120.51, 93 -"Orioles", 81.43, 9999999999 -"Rays", 64.17, 90 -"Angels", 154.49, -89 -"Tigers", 132.30, 88 -"Cardinals", 1100000000000000.30, 88 -"Dodgers", 95.14, 86 -"White Sox", 96.92, 85 -"Brewers", 97.65, -83 -"Phillies", 174.54, 81 -"Diamondbacks", 74000000.28, 81 -"Pirates", 63.43, 79 -"Padres", 55.24, 76000000 -"Mariners", 81.97, 75 -"Mets", 93.35, 74 -"Blue Jays", 75.48, 73 -"Royals", 60.91, 72 -"Marlins", 118.07, 69 -"Red Sox", 173.18, 69 -"Indians", 78.43, 68 -"Twins", 94.08, 66 -"Rockies", 78.06, 64 -"Cubs", 88.19, 61 -"Astros", 60.65, 55 -"WHOKNOWS", -99999999, NaN \ No newline at end of file diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/ge_demo.py b/python_modules/libraries/dagster-ge/dagster_ge/examples/ge_demo.py deleted file mode 100644 index 24aab1a80b560..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge/examples/ge_demo.py +++ /dev/null @@ -1,47 +0,0 @@ -from dagster import job, op -from dagster._utils import file_relative_path -from dagster_ge.factory import ge_data_context, ge_validation_op_factory -from pandas import read_csv - - -@op -def read_in_datafile(csv_path): - return read_csv(csv_path) - - -@op -def process_payroll(df): - return len(df) - - -@op -def postprocess_payroll(numrows, expectation): - if expectation["success"]: - return 
numrows - else: - raise ValueError - - -payroll_expectations = ge_validation_op_factory( - name="ge_validation_op", datasource_name="getest", suite_name="basic.warning" -) - - -@job( - resource_defs={"ge_data_context": ge_data_context}, - config={ - "resources": { - "ge_data_context": { - "config": {"ge_root_dir": file_relative_path(__file__, "./great_expectations")} - } - }, - "solids": { - "read_in_datafile": { - "inputs": {"csv_path": {"value": file_relative_path(__file__, "./succeed.csv")}} - } - }, - }, -) -def payroll_data(): - output_df = read_in_datafile() - postprocess_payroll(process_payroll(output_df), payroll_expectations(output_df)) diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/.gitignore b/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/.gitignore deleted file mode 100644 index 9052951bb146c..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -uncommitted/.ipynbcheckpoints/ -uncommitted/data_docs/ -uncommitted/validations/ \ No newline at end of file diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/great_expectations.yml b/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/great_expectations.yml deleted file mode 100644 index 984bca7e782ac..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/great_expectations.yml +++ /dev/null @@ -1,124 +0,0 @@ -# Welcome to Great Expectations! Always know what to expect from your data. -# -# Here you can define datasources, batch kwargs generators, integrations and -# more. This file is intended to be committed to your repo. For help with -# configuration please: -# - Read our docs: https://docs.greatexpectations.io/en/latest/reference/data_context_reference.html#configuration -# - Join our slack channel: http://greatexpectations.io/slack - -config_version: 2.0 - -# Datasources tell Great Expectations where your data lives and how to get it. -# You can use the CLI command `great_expectations datasource new` to help you -# add a new datasource. Read more at https://docs.greatexpectations.io/en/latest/features/datasource.html -datasources: - getest: - batch_kwargs_generators: - subdir_reader: - class_name: SubdirReaderBatchKwargsGenerator - base_directory: ..\. - class_name: PandasDatasource - module_name: great_expectations.datasource - data_asset_type: - class_name: PandasDataset - module_name: great_expectations.dataset - -# This config file supports variable substitution which enables: 1) keeping -# secrets out of source control & 2) environment-based configuration changes -# such as staging vs prod. -# -# When GE encounters substitution syntax (like `my_key: ${my_value}` or -# `my_key: $my_value`) in the great_expectations.yml file, it will attempt -# to replace the value of `my_key` with the value from an environment -# variable `my_value` or a corresponding key read from this config file, -# which is defined through the `config_variables_file_path`. -# Environment variables take precedence over variables defined here. -# -# Substitution values defined here can be a simple (non-nested) value, -# nested value such as a dictionary, or an environment variable (i.e. 
${ENV_VAR}) -# -# -# https://docs.greatexpectations.io/en/latest/how_to_guides/configuring_data_contexts/how_to_use_a_yaml_file_or_environment_variables_to_populate_credentials.html - -config_variables_file_path: uncommitted/config_variables.yml - -# The plugins_directory will be added to your python path for custom modules -# used to override and extend Great Expectations. -plugins_directory: plugins/ - -# Validation Operators are customizable workflows that bundle the validation of -# one or more expectation suites and subsequent actions. The example below -# stores validations and send a slack notification. To read more about -# customizing and extending these, read: https://docs.greatexpectations.io/en/latest/features/validation_operators_and_actions.html -validation_operators: - action_list_operator: - # To learn how to configure sending Slack notifications during evaluation - # (and other customizations), read: https://docs.greatexpectations.io/en/latest/reference/validation_operators/action_list_validation_operator.html - class_name: ActionListValidationOperator - action_list: - - name: store_validation_result - action: - class_name: StoreValidationResultAction - - name: store_evaluation_params - action: - class_name: StoreEvaluationParametersAction - - name: update_data_docs - action: - class_name: UpdateDataDocsAction - # - name: send_slack_notification_on_validation_result - # action: - # class_name: SlackNotificationAction - # # put the actual webhook URL in the uncommitted/config_variables.yml file - # slack_webhook: ${validation_notification_slack_webhook} - # notify_on: all # possible values: "all", "failure", "success" - # renderer: - # module_name: great_expectations.render.renderer.slack_renderer - # class_name: SlackRenderer - -stores: - # Stores are configurable places to store things like Expectations, Validations - # Data Docs, and more. These are for advanced users only - most users can simply - # leave this section alone. - # - # Three stores are required: expectations, validations, and - # evaluation_parameters, and must exist with a valid store entry. Additional - # stores can be configured for uses such as data_docs, validation_operators, etc. - expectations_store: - class_name: ExpectationsStore - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: expectations/ - - validations_store: - class_name: ValidationsStore - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: uncommitted/validations/ - - evaluation_parameter_store: - # Evaluation Parameters enable dynamic expectations. Read more here: - # https://docs.greatexpectations.io/en/latest/reference/evaluation_parameters.html - class_name: EvaluationParameterStore - -expectations_store_name: expectations_store -validations_store_name: validations_store -evaluation_parameter_store_name: evaluation_parameter_store - -data_docs_sites: - # Data Docs make it simple to visualize data quality in your project. These - # include Expectations, Validations & Profiles. The are built for all - # Datasources from JSON artifacts in the local repo including validations & - # profiles from the uncommitted directory. 
Read more at https://docs.greatexpectations.io/en/latest/features/data_docs.html - local_site: - class_name: SiteBuilder - # set to false to hide how-to buttons in Data Docs - show_how_to_buttons: true - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: uncommitted/data_docs/local_site/ - site_index_builder: - class_name: DefaultSiteIndexBuilder - -anonymous_usage_statistics: - data_context_id: 9656cbae-5408-4731-a54c-439358114e8f - enabled: true diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/notebooks/pandas/validation_playground.ipynb b/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/notebooks/pandas/validation_playground.ipynb deleted file mode 100644 index 61c5c58a18a1e..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/notebooks/pandas/validation_playground.ipynb +++ /dev/null @@ -1,243 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Validation Playground\n", - "\n", - "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", - "\n", - "#### This notebook assumes that you created at least one expectation suite in your project.\n", - "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", - "\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from great_expectations.datasource.types import BatchKwargs\n", - "import datetime" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Choose an Expectation Suite\n", - "\n", - "List expectation suites that you created in your project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.list_expectation_suite_names()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = None # TODO: set to a name from the list above" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list datasources of the type PandasDatasource in your project\n", - "[\n", - " datasource[\"name\"]\n", - " for datasource in context.list_datasources()\n", - " if datasource[\"class_name\"] == \"PandasDatasource\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datasource_name = None # TODO: set to a datasource name from above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you would like to validate a file on a filesystem:\n", - "batch_kwargs = {\"path\": \"YOUR_FILE_PATH\", \"datasource\": datasource_name}\n", - "\n", - "# If you already loaded the data into a Pandas Data Frame:\n", - "batch_kwargs = {\"dataset\": \"YOUR_DATAFRAME\", \"datasource\": datasource_name}\n", - "\n", - "\n", - "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Validate the batch with Validation Operators\n", - "\n", - "`Validation Operators` provide a convenient way to bundle the validation of\n", - "multiple expectation suites and the actions that should be taken after validation.\n", - "\n", - "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", - "\n", - "* validating a group of batches that are logically related\n", - "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", - "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", - "\n", - "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", - "\n", - "\"\"\"\n", - "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. 
View the Validation Results in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", - "\n", - "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You ran Validations!\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Read about the typical workflow with Great Expectations:\n", - "\n", - "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/notebooks/spark/validation_playground.ipynb b/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/notebooks/spark/validation_playground.ipynb deleted file mode 100644 index 555c4dd32b252..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/notebooks/spark/validation_playground.ipynb +++ /dev/null @@ -1,244 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Validation Playground\n", - "\n", - "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", - "\n", - "#### This notebook assumes that you created at least one expectation suite in your project.\n", - "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", - "\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from 
great_expectations.datasource.types import BatchKwargs\n", - "import datetime" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Choose an Expectation Suite\n", - "\n", - "List expectation suites that you created in your project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.list_expectation_suite_names()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = None # TODO: set to a name from the list above" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list datasources of the type SparkDFDatasource in your project\n", - "[\n", - " datasource[\"name\"]\n", - " for datasource in context.list_datasources()\n", - " if datasource[\"class_name\"] == \"SparkDFDatasource\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datasource_name = None # TODO: set to a datasource name from above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you would like to validate a file on a filesystem:\n", - "batch_kwargs = {\"path\": \"YOUR_FILE_PATH\", \"datasource\": datasource_name}\n", - "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true')\n", - "\n", - "# If you already loaded the data into a PySpark Data Frame:\n", - "batch_kwargs = {\"dataset\": \"YOUR_DATAFRAME\", \"datasource\": datasource_name}\n", - "\n", - "\n", - "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. 
Validate the batch with Validation Operators\n", - "\n", - "`Validation Operators` provide a convenient way to bundle the validation of\n", - "multiple expectation suites and the actions that should be taken after validation.\n", - "\n", - "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", - "\n", - "* validating a group of batches that are logically related\n", - "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", - "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", - "\n", - "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", - "\n", - "\"\"\"\n", - "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. View the Validation Results in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", - "\n", - "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You ran Validations!\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Read about the typical workflow with Great Expectations:\n", - "\n", - "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. 
Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/notebooks/sql/validation_playground.ipynb b/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/notebooks/sql/validation_playground.ipynb deleted file mode 100644 index 17e21e1308c4f..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/notebooks/sql/validation_playground.ipynb +++ /dev/null @@ -1,246 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Validation Playground\n", - "\n", - "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", - "\n", - "#### This notebook assumes that you created at least one expectation suite in your project.\n", - "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", - "\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from great_expectations.datasource.types import BatchKwargs\n", - "import datetime" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Choose an Expectation Suite\n", - "\n", - "List expectation suites that you created in your project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.list_expectation_suite_names()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = None # TODO: set to a name from the list above" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list datasources of the type SqlAlchemyDatasource in your project\n", - "[\n", - " datasource[\"name\"]\n", - " for datasource in context.list_datasources()\n", - " if datasource[\"class_name\"] == \"SqlAlchemyDatasource\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datasource_name = None # TODO: set to a datasource name from above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you would like to validate an entire table or view in your database's default schema:\n", - "batch_kwargs = {\"table\": \"YOUR_TABLE\", \"datasource\": datasource_name}\n", - "\n", - "# If you would like to validate an entire table or view from a non-default schema in your database:\n", - "batch_kwargs = {\"table\": \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", \"datasource\": datasource_name}\n", - "\n", - "# If you would like to validate the result set of a query:\n", - "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", - "\n", - "\n", - "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Validate the batch with Validation Operators\n", - "\n", - "`Validation Operators` provide a convenient way to bundle the validation of\n", - "multiple expectation suites and the actions that should be taken after validation.\n", - "\n", - "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", - "\n", - "* validating a group of batches that are logically related\n", - "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", - "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", - "\n", - "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", - "\n", - "\"\"\"\n", - "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. View the Validation Results in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", - "\n", - "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You ran Validations!\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Read about the typical workflow with Great Expectations:\n", - "\n", - "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/uncommitted/config_variables.yml b/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/uncommitted/config_variables.yml deleted file mode 100644 index dc196f7486ca2..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/uncommitted/config_variables.yml +++ /dev/null @@ -1,18 +0,0 @@ -# This config file supports variable substitution which enables: 1) keeping -# secrets out of source control & 2) environment-based configuration changes -# such as staging vs prod. 
-# -# When GE encounters substitution syntax (like `my_key: ${my_value}` or -# `my_key: $my_value`) in the great_expectations.yml file, it will attempt -# to replace the value of `my_key` with the value from an environment -# variable `my_value` or a corresponding key read from this config file, -# which is defined through the `config_variables_file_path`. -# Environment variables take precedence over variables defined here. -# -# Substitution values defined here can be a simple (non-nested) value, -# nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) -# -# -# https://docs.greatexpectations.io/en/latest/how_to_guides/configuring_data_contexts/how_to_use_a_yaml_file_or_environment_variables_to_populate_credentials.html - -instance_id: bd20cf38-7fa3-4477-96ed-fad020607cec diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/uncommitted/edit_basic.warning.ipynb b/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/uncommitted/edit_basic.warning.ipynb deleted file mode 100644 index 8a34a62819abd..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/uncommitted/edit_basic.warning.ipynb +++ /dev/null @@ -1,282 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Edit Your Expectation Suite\n", - "Use this notebook to recreate and modify your expectation suite:\n", - "\n", - "**Expectation Suite Name**: `basic.warning`\n", - "\n", - "We'd love it if you **reach out to us on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2020-06-24T09:51:24-0400 - INFO - Great Expectations logging enabled at 20 level by JupyterUX module.\n" - ] - }, - { - "ename": "DataContextError", - "evalue": "expectation_suite basic.warning not found", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mDataContextError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;31m# remove the other one.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[0mexpectation_suite_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"basic.warning\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 15\u001b[1;33m \u001b[0msuite\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcontext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_expectation_suite\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mexpectation_suite_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 16\u001b[0m \u001b[0msuite\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexpectations\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 17\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\users\\leorf\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\great_expectations\\data_context\\data_context.py\u001b[0m in \u001b[0;36mget_expectation_suite\u001b[1;34m(self, expectation_suite_name)\u001b[0m\n\u001b[0;32m 1154\u001b[0m 
\u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1155\u001b[0m raise ge_exceptions.DataContextError(\n\u001b[1;32m-> 1156\u001b[1;33m \u001b[1;34m\"expectation_suite %s not found\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mexpectation_suite_name\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1157\u001b[0m )\n\u001b[0;32m 1158\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mDataContextError\u001b[0m: expectation_suite basic.warning not found" - ], - "output_type": "error" - } - ], - "source": [ - "import dagstermill\n", - "import json\n", - "import datetime\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from great_expectations.data_context.types.resource_identifiers import (\n", - " ValidationResultIdentifier,\n", - ")\n", - "\n", - "context = ge.data_context.DataContext()\n", - "\n", - "# Feel free to change the name of your suite here. Renaming this will not\n", - "# remove the other one.\n", - "expectation_suite_name = \"basic.warning\"\n", - "suite = context.get_expectation_suite(expectation_suite_name)\n", - "suite.expectations = []\n", - "\n", - "batch_kwargs = {\n", - " \"data_asset_name\": \"basic\",\n", - " \"datasource\": \"blink\",\n", - " \"path\": \"C:\\\\Users\\\\leorf\\\\PycharmProjects\\\\elementl\\\\dagster\\\\python_modules\\\\libraries\\\\dagster-ge\\\\great_expectations\\\\..\\\\.\\\\basic.csv\",\n", - "}\n", - "batch = context.get_batch(batch_kwargs, suite)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create & Edit Expectations\n", - "\n", - "Add expectations by calling specific expectation methods on the `batch` object. They all begin with `.expect_` which makes autocompleting easy using tab.\n", - "\n", - "You can see all the available expectations in the **[expectation glossary](https://docs.greatexpectations.io/en/latest/reference/glossary_of_expectations.html?utm_source=notebook&utm_medium=create_expectations)**." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Table Expectation(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_table_row_count_to_be_between(max_value=33, min_value=27)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_table_column_count_to_equal(value=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_table_columns_to_match_ordered_list(\n", - " column_list=[\"Team\", ' \"Payroll (millions)\"', ' \"Wins\"']\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Column Expectation(s)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### ` \"Payroll (millions)\"`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_values_to_not_be_null(' \"Payroll (millions)\"')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_min_to_be_between(' \"Payroll (millions)\"', max_value=56.24, min_value=54.24)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_max_to_be_between(' \"Payroll (millions)\"', max_value=198.96, min_value=196.96)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_mean_to_be_between(\n", - " ' \"Payroll (millions)\"', max_value=99.01899999999998, min_value=97.01899999999998\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_median_to_be_between(' \"Payroll (millions)\"', max_value=86.75, min_value=84.75)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_quantile_values_to_be_between(\n", - " ' \"Payroll (millions)\"',\n", - " allow_relative_error=False,\n", - " quantile_ranges={\n", - " \"quantiles\": [0.05, 0.25, 0.5, 0.75, 0.95],\n", - " \"value_ranges\": [\n", - " [54.37, 56.37],\n", - " [74.48, 76.48],\n", - " [82.31, 84.31],\n", - " [116.62, 118.62],\n", - " [173.54, 175.54],\n", - " ],\n", - " },\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `Team`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_values_to_not_be_null(\"Team\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_value_lengths_to_be_between(\"Team\", min_value=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save & Review Your Expectations\n", - "\n", - "Let's save the expectation suite as a JSON file in the `great_expectations/expectations` directory of your project.\n", - "If you decide not to save some expectations that you created, use [remove_expectation method](https://docs.greatexpectations.io/en/latest/module_docs/data_asset_module.html?highlight=remove_expectation&utm_source=notebook&utm_medium=edit_expectations#great_expectations.data_asset.data_asset.DataAsset.remove_expectation).\n", - "\n", - "Let's now rebuild your 
Data Docs, which helps you communicate about your data with both machines and humans." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.save_expectation_suite(discard_failed_expectations=False)\n", - "\n", - "\"\"\"\n", - "Let's create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")\n", - "dagstermill.yield_result(json.loads(str(results.list_validation_results())))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dagster", - "language": "python", - "name": "dagster" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/succeed.csv b/python_modules/libraries/dagster-ge/dagster_ge/examples/succeed.csv deleted file mode 100644 index a8be4a5dfacb2..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge/examples/succeed.csv +++ /dev/null @@ -1,31 +0,0 @@ -"Team", "Payroll (millions)", "Wins" -"Nationals", 81.34, 98 -"Reds", 82.20, 97 -"Yankees", 197.96, 95 -"Giants", 117.62, 94 -"Braves", 83.31, 94 -"Athletics", 55.37, 94 -"Rangers", 120.51, 93 -"Orioles", 81.43, 93 -"Rays", 64.17, 90 -"Angels", 154.49, 89 -"Tigers", 132.30, 88 -"Cardinals", 110.30, 88 -"Dodgers", 95.14, 86 -"White Sox", 96.92, 85 -"Brewers", 97.65, 83 -"Phillies", 174.54, 81 -"Diamondbacks", 74.28, 81 -"Pirates", 63.43, 79 -"Padres", 55.24, 76 -"Mariners", 81.97, 75 -"Mets", 93.35, 74 -"Blue Jays", 75.48, 73 -"Royals", 60.91, 72 -"Marlins", 118.07, 69 -"Red Sox", 173.18, 69 -"Indians", 78.43, 68 -"Twins", 94.08, 66 -"Rockies", 78.06, 64 -"Cubs", 88.19, 61 -"Astros", 60.65, 55 \ No newline at end of file diff --git a/python_modules/libraries/dagster-ge/dagster_ge/factory.py b/python_modules/libraries/dagster-ge/dagster_ge/factory.py index bd6df83edb636..bf42e6e8a5310 100644 --- a/python_modules/libraries/dagster-ge/dagster_ge/factory.py +++ b/python_modules/libraries/dagster-ge/dagster_ge/factory.py @@ -1,5 +1,5 @@ import datetime -from typing import Any, Dict +from typing import Any, Literal, Mapping, Optional import great_expectations as ge from dagster import ( @@ -8,6 +8,7 @@ IAttachDifferentObjectToOpContext, In, MetadataValue, + OpDefinition, OpExecutionContext, Out, Output, @@ -16,19 +17,14 @@ resource, ) from dagster._core.definitions.resource_definition import dagster_maintained_resource +from dagster._core.execution.context.init import InitResourceContext from 
dagster._core.storage.tags import COMPUTE_KIND_TAG +from dagster._core.types.dagster_type import DagsterType from dagster_pandas import DataFrame from great_expectations.render.renderer import ValidationResultsPageRenderer from great_expectations.render.view import DefaultMarkdownPageView from pydantic import Field -try: - # ge < v0.13.0 - from great_expectations.core import convert_to_json_serializable -except ImportError: - # ge >= v0.13.0 - from great_expectations.core.util import convert_to_json_serializable - class GEContextResource(ConfigurableResource, IAttachDifferentObjectToOpContext): ge_root_dir: str = Field( @@ -47,113 +43,22 @@ def get_object_to_set_on_execution_context(self): @dagster_maintained_resource @resource(config_schema=GEContextResource.to_config_schema()) -def ge_data_context(context): +def ge_data_context(context: InitResourceContext) -> GEContextResource: return GEContextResource.from_resource_context(context).get_data_context() def ge_validation_op_factory( - name, - datasource_name, - suite_name, - validation_operator_name=None, - input_dagster_type=DataFrame, - batch_kwargs=None, -): - """Generates ops for interacting with GE. - - Args: - name (str): the name of the op - datasource_name (str): the name of your DataSource, see your great_expectations.yml - suite_name (str): the name of your expectation suite, see your great_expectations.yml - validation_operator_name (Optional[str]): what validation operator to run -- defaults to - None, which generates an ephemeral validator. If you want to save data docs, use - 'action_list_operator'. - See https://legacy.docs.greatexpectations.io/en/0.12.1/reference/core_concepts/validation_operators_and_actions.html# - input_dagster_type (DagsterType): the Dagster type used to type check the input to the op. - Defaults to `dagster_pandas.DataFrame`. - batch_kwargs (Optional[dict]): overrides the `batch_kwargs` parameter when calling the - `ge_data_context`'s `get_batch` method. Defaults to `{"dataset": dataset}`, where - `dataset` is the input to the generated op. - - Returns: - An op that takes in a set of data and yields both an expectation with relevant metadata - and an output with all the metadata (for user processing) - """ - check.str_param(datasource_name, "datasource_name") - check.str_param(suite_name, "suite_name") - check.opt_str_param(validation_operator_name, "validation_operator_name") - batch_kwargs = check.opt_dict_param(batch_kwargs, "batch_kwargs") - - @op( - name=name, - ins={"dataset": In(input_dagster_type)}, - out=Out( - dict, - description=""" - This op yields an expectationResult with a structured dict of metadata from - the GE suite, as well as the full result in case a user wants to process it differently. - The structured dict contains both summary stats from the suite as well as expectation by - expectation results/details. 
- """, - ), - required_resource_keys={"ge_data_context"}, - tags={COMPUTE_KIND_TAG: "ge"}, - ) - def _ge_validation_fn(context: OpExecutionContext, dataset): - data_context = context.resources.ge_data_context - - if validation_operator_name is not None: - validation_operator = validation_operator_name - else: - data_context.add_validation_operator( - "ephemeral_validation", - {"class_name": "ActionListValidationOperator", "action_list": []}, - ) - validation_operator = "ephemeral_validation" - suite = data_context.get_expectation_suite(suite_name) - final_batch_kwargs = batch_kwargs or {"dataset": dataset} - if "datasource" in final_batch_kwargs: - context.log.warning( - "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` " - "parameter of the op factory instead." - ) - final_batch_kwargs["datasource"] = datasource_name - batch = data_context.get_batch(final_batch_kwargs, suite) - run_id = { - "run_name": datasource_name + " run", - "run_time": datetime.datetime.now(datetime.timezone.utc), - } - results = data_context.run_validation_operator( - validation_operator, assets_to_validate=[batch], run_id=run_id - ) - res = convert_to_json_serializable(results.list_validation_results())[0] - validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True) - rendered_document_content_list = ( - validation_results_page_renderer.render_validation_operator_result(results) - ) - md_str = " ".join(DefaultMarkdownPageView().render(rendered_document_content_list)) - - yield ExpectationResult( - success=res["success"], - metadata={"Expectation Results": MetadataValue.md(md_str)}, - ) - yield Output(res) - - return _ge_validation_fn - - -def ge_validation_op_factory_v3( - name, - datasource_name, - data_connector_name, - data_asset_name, - suite_name, + name: str, + datasource_name: str, + data_connector_name: str, + data_asset_name: str, + suite_name: str, batch_identifiers: dict, - input_dagster_type=DataFrame, - runtime_method_type="batch_data", - extra_kwargs=None, -): - """Generates ops for interacting with GE (v3 API). + input_dagster_type: DagsterType = DataFrame, # default to pandas support + runtime_method_type: Literal["batch_data", "path", "query"] = "batch_data", + extra_kwargs: Optional[Mapping[str, Any]] = None, +) -> OpDefinition: + """Generates ops for interacting with Great Expectations. Args: name (str): the name of the op @@ -173,16 +78,19 @@ def ge_validation_op_factory_v3( in-memory object. extra_kwargs (Optional[dict]): adds extra kwargs to the invocation of `ge_data_context`'s `get_validator` method. 
If not set, input will be: - { - "datasource_name": datasource_name, - "data_connector_name": data_connector_name, - "data_asset_name": data_asset_name, - "runtime_parameters": { - "": - }, - "batch_identifiers": batch_identifiers, - "expectation_suite_name": suite_name, - } + + :: + + { + "datasource_name": datasource_name, + "data_connector_name": data_connector_name, + "data_asset_name": data_asset_name, + "runtime_parameters": { + "": + }, + "batch_identifiers": batch_identifiers, + "expectation_suite_name": suite_name, + } Returns: An op that takes in a set of data and yields both an expectation with relevant metadata and @@ -193,7 +101,7 @@ def ge_validation_op_factory_v3( check.str_param(data_connector_name, "data_connector_name") check.str_param(suite_name, "suite_name") - _extra_kwargs: Dict[Any, Any] = check.opt_dict_param(extra_kwargs, "extra_kwargs") + _extra_kwargs = check.opt_mapping_param(extra_kwargs, "extra_kwargs") @op( name=name, @@ -213,16 +121,15 @@ def ge_validation_op_factory_v3( def _ge_validation_fn(context: OpExecutionContext, dataset): data_context = context.resources.ge_data_context - validator_kwargs = { - "datasource_name": datasource_name, - "data_connector_name": data_connector_name, - "data_asset_name": datasource_name or data_asset_name, - "runtime_parameters": {runtime_method_type: dataset}, - "batch_identifiers": batch_identifiers, - "expectation_suite_name": suite_name, + validator = data_context.get_validator( + datasource_name=datasource_name, + data_connector_name=data_connector_name, + data_asset_name=datasource_name or data_asset_name, + runtime_parameters={runtime_method_type: dataset}, + batch_identifiers=batch_identifiers, + expectation_suite_name=suite_name, **_extra_kwargs, - } - validator = data_context.get_validator(**validator_kwargs) + ) run_id = { "run_name": datasource_name + " run", diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/__snapshots__/test_resources.ambr b/python_modules/libraries/dagster-ge/dagster_ge_tests/__snapshots__/test_resources.ambr deleted file mode 100644 index e6313510e0725..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/__snapshots__/test_resources.ambr +++ /dev/null @@ -1,247 +0,0 @@ -# serializer version: 1 -# name: test_yielded_results_config_pandas[hello_world_pandas_job_v2-./great_expectations] - ''' - - # Validation Results - - - - - ## Overview - ### **Expectation Suite:** **basic.warning** - **Data asset:** **None** - **Status:** **Succeeded** - - - - - - ### Statistics - - - - - - - | | | - | ------------ | ------------ | - Evaluated Expectations | 11 - Successful Expectations | 11 - Unsuccessful Expectations | 0 - Success Percent | 100% - - - - - - ## Table-Level Expectations - - - - - - - - - | Status | Expectation | Observed Value | - | ------------ | ------------ | ------------ | - ✅ | Must have greater than or equal to **27** and less than or equal to **33** rows. | 30 - ✅ | Must have exactly **3** columns. | 3 - ✅ | Must have these columns in this order: **Team**, ** "Payroll (millions)"**, ** "Wins"** | ['Team', ' "Payroll (millions)"', ' "Wins"'] - - - - - - ## "Payroll (millions)" - - - - - - - - - | Status | Expectation | Observed Value | - | ------------ | ------------ | ------------ | - ✅ | values must never be null. | 100% not null - ✅ | minimum value must be greater than or equal to **54.24** and less than or equal to **56.24**. | 55.24 - ✅ | maximum value must be greater than or equal to **196.96** and less than or equal to **198.96**. 
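Note on the factory.py change above: the legacy v2 `ge_validation_op_factory` is removed and the former `ge_validation_op_factory_v3` becomes the single, fully typed `ge_validation_op_factory`, which now calls `data_context.get_validator(...)` directly. The sketch below is illustrative only and is not part of this patch; every name in it (the datasource, data connector, data asset, suite, CSV path, and job/op names) is a hypothetical placeholder that must match your own Great Expectations project configuration.

    import pandas as pd
    from dagster import job, op
    from dagster_ge.factory import ge_data_context, ge_validation_op_factory

    # Build an op that validates a pandas DataFrame against an expectation suite.
    # All names below are placeholders for this sketch.
    payroll_expectations = ge_validation_op_factory(
        name="ge_validation_op",
        datasource_name="my_datasource",
        data_connector_name="my_runtime_data_connector",
        data_asset_name="my_asset",
        suite_name="basic.warning",
        batch_identifiers={"foo": "bar"},
    )

    @op
    def read_payroll() -> pd.DataFrame:
        # Hypothetical input data for the sketch.
        return pd.read_csv("payroll.csv")

    @job(
        resource_defs={
            "ge_data_context": ge_data_context.configured(
                {"ge_root_dir": "./great_expectations"}
            )
        }
    )
    def payroll_validation_job():
        # The generated op yields an ExpectationResult plus an Output containing the
        # raw validation results dict, so downstream ops can post-process it.
        payroll_expectations(read_payroll())

Because `GEContextResource` is a `ConfigurableResource` implementing `IAttachDifferentObjectToOpContext`, the same resource can also be supplied Pythonically, e.g. `Definitions(resources={"ge_data_context": GEContextResource(ge_root_dir="./great_expectations")})`.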
| 197.96 - ✅ | mean must be greater than or equal to **97.01899999999998** and less than or equal to **99.01899999999998**. | ≈98.019 - ✅ | median must be greater than or equal to **84.75** and less than or equal to **86.75**. | 85.75 - ✅ | quantiles must be within the following value ranges. - - - - - | Quantile | Min Value | Max Value | - | ------------ | ------------ | ------------ | - 0.05 | 54.37 | 56.37 - Q1 | 74.48 | 76.48 - Median | 82.31 | 84.31 - Q3 | 116.62 | 118.62 - 0.95 | 173.54 | 175.54 - | - - - - - | Quantile | Value | - | ------------ | ------------ | - 0.05 | 55.37 - Q1 | 75.48 - Median | 83.31 - Q3 | 117.62 - 0.95 | 174.54 - - - - - - - ## Team - - - - - - - - - | Status | Expectation | Observed Value | - | ------------ | ------------ | ------------ | - ✅ | values must never be null. | 100% not null - ✅ | values must always be greater than or equal to **1** characters long. | 0% unexpected - - - - - - - - - ''' -# --- -# name: test_yielded_results_config_pandas[hello_world_pandas_job_v3-./great_expectations_v3] - ''' - - # Validation Results - - - - - ## Overview - ### **Expectation Suite:** **basic.warning** - **Data asset:** **getest** - **Status:** **Succeeded** - - - - - - ### Statistics - - - - - - - | | | - | ------------ | ------------ | - Evaluated Expectations | 11 - Successful Expectations | 11 - Unsuccessful Expectations | 0 - Success Percent | 100% - - - - - - ## Table-Level Expectations - - - - - - - - - | Status | Expectation | Observed Value | - | ------------ | ------------ | ------------ | - ✅ | Must have greater than or equal to **27** and less than or equal to **33** rows. | 30 - ✅ | Must have exactly **3** columns. | 3 - ✅ | Must have these columns in this order: **Team**, ** "Payroll (millions)"**, ** "Wins"** | ['Team', ' "Payroll (millions)"', ' "Wins"'] - - - - - - ## "Payroll (millions)" - - - - - - - - - | Status | Expectation | Observed Value | - | ------------ | ------------ | ------------ | - ✅ | values must never be null. | 100% not null - ✅ | minimum value must be greater than or equal to **54.24** and less than or equal to **56.24**. | 55.24 - ✅ | maximum value must be greater than or equal to **196.96** and less than or equal to **198.96**. | 197.96 - ✅ | mean must be greater than or equal to **97.01899999999998** and less than or equal to **99.01899999999998**. | ≈98.019 - ✅ | median must be greater than or equal to **84.75** and less than or equal to **86.75**. | 85.75 - ✅ | quantiles must be within the following value ranges. - - - - - | Quantile | Min Value | Max Value | - | ------------ | ------------ | ------------ | - 0.05 | 54.37 | 56.37 - Q1 | 74.48 | 76.48 - Median | 82.31 | 84.31 - Q3 | 116.62 | 118.62 - 0.95 | 173.54 | 175.54 - | - - - - - | Quantile | Value | - | ------------ | ------------ | - 0.05 | 55.37 - Q1 | 75.48 - Median | 83.31 - Q3 | 117.62 - 0.95 | 174.54 - - - - - - - ## Team - - - - - - - - - | Status | Expectation | Observed Value | - | ------------ | ------------ | ------------ | - ✅ | values must never be null. | 100% not null - ✅ | values must always be greater than or equal to **1** characters long. 
| 0% unexpected - - - - - - - - - ''' -# --- diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/__snapshots__/test_basic_integ.ambr b/python_modules/libraries/dagster-ge/dagster_ge_tests/__snapshots__/test_validation.ambr similarity index 73% rename from python_modules/libraries/dagster-ge/dagster_ge_tests/__snapshots__/test_basic_integ.ambr rename to python_modules/libraries/dagster-ge/dagster_ge_tests/__snapshots__/test_validation.ambr index eb00158914dc7..942f625147a76 100644 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/__snapshots__/test_basic_integ.ambr +++ b/python_modules/libraries/dagster-ge/dagster_ge_tests/__snapshots__/test_validation.ambr @@ -1,5 +1,5 @@ # serializer version: 1 -# name: test_yielded_results_config_pandas[hello_world_pandas_job_v2-./great_expectations] +# name: test_ge_validation[pandas-new] ''' # Validation Results @@ -9,7 +9,7 @@ ## Overview ### **Expectation Suite:** **basic.warning** - **Data asset:** **None** + **Data asset:** **getest** **Status:** **Succeeded** @@ -122,7 +122,7 @@ ''' # --- -# name: test_yielded_results_config_pandas[hello_world_pandas_job_v3-./great_expectations_v3] +# name: test_ge_validation[pandas-old] ''' # Validation Results @@ -245,7 +245,130 @@ ''' # --- -# name: test_yielded_results_config_pyspark_v2 +# name: test_ge_validation[pyspark-new] + ''' + + # Validation Results + + + + + ## Overview + ### **Expectation Suite:** **basic.warning** + **Data asset:** **getestspark** + **Status:** **Succeeded** + + + + + + ### Statistics + + + + + + + | | | + | ------------ | ------------ | + Evaluated Expectations | 11 + Successful Expectations | 11 + Unsuccessful Expectations | 0 + Success Percent | 100% + + + + + + ## Table-Level Expectations + + + + + + + + + | Status | Expectation | Observed Value | + | ------------ | ------------ | ------------ | + ✅ | Must have greater than or equal to **27** and less than or equal to **33** rows. | 30 + ✅ | Must have exactly **3** columns. | 3 + ✅ | Must have these columns in this order: **Team**, ** "Payroll (millions)"**, ** "Wins"** | ['Team', ' "Payroll (millions)"', ' "Wins"'] + + + + + + ## "Payroll (millions)" + + + + + + + + + | Status | Expectation | Observed Value | + | ------------ | ------------ | ------------ | + ✅ | values must never be null. | 100% not null + ✅ | minimum value must be greater than or equal to **54.24** and less than or equal to **56.24**. | 55.24 + ✅ | maximum value must be greater than or equal to **196.96** and less than or equal to **198.96**. | 197.96 + ✅ | mean must be greater than or equal to **97.01899999999998** and less than or equal to **99.01899999999998**. | 98.019 + ✅ | median must be greater than or equal to **84.75** and less than or equal to **86.75**. | 85.75 + ✅ | quantiles must be within the following value ranges. + + + + + | Quantile | Min Value | Max Value | + | ------------ | ------------ | ------------ | + 0.05 | 54.37 | 56.37 + Q1 | 74.48 | 76.48 + Median | 82.31 | 84.31 + Q3 | 116.62 | 118.62 + 0.95 | 173.54 | 175.54 + | + + + + + | Quantile | Value | + | ------------ | ------------ | + 0.05 | 55.37 + Q1 | 75.48 + Median | 83.31 + Q3 | 117.62 + 0.95 | 174.54 + + + + + + + ## Team + + + + + + + + + | Status | Expectation | Observed Value | + | ------------ | ------------ | ------------ | + ✅ | values must never be null. | 100% not null + ✅ | values must always be greater than or equal to **1** characters long. 
| 0% unexpected + + + + + + + + + ''' +# --- +# name: test_ge_validation[pyspark-old] ''' # Validation Results @@ -255,7 +378,7 @@ ## Overview ### **Expectation Suite:** **basic.warning** - **Data asset:** **None** + **Data asset:** **getestspark** **Status:** **Succeeded** diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/expectations/.ge_store_backend_id b/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/expectations/.ge_store_backend_id deleted file mode 100644 index d52739cbcff45..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/expectations/.ge_store_backend_id +++ /dev/null @@ -1 +0,0 @@ -store_backend_id = 9656cbae-5408-4731-a54c-439358114e8f diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/expectations/basic/warning.json b/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/expectations/basic/warning.json deleted file mode 100644 index 4ac8f88e2b573..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/expectations/basic/warning.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "data_asset_type": "Dataset", - "expectation_suite_name": "basic.warning", - "expectations": [ - { - "expectation_type": "expect_table_row_count_to_be_between", - "kwargs": { - "max_value": 33, - "min_value": 27 - }, - "meta": {} - }, - { - "expectation_type": "expect_table_column_count_to_equal", - "kwargs": { - "value": 3 - }, - "meta": {} - }, - { - "expectation_type": "expect_table_columns_to_match_ordered_list", - "kwargs": { - "column_list": [ - "Team", - " \"Payroll (millions)\"", - " \"Wins\"" - ] - }, - "meta": {} - }, - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": " \"Payroll (millions)\"" - }, - "meta": {} - }, - { - "expectation_type": "expect_column_min_to_be_between", - "kwargs": { - "column": " \"Payroll (millions)\"", - "max_value": 56.24, - "min_value": 54.24 - }, - "meta": {} - }, - { - "expectation_type": "expect_column_max_to_be_between", - "kwargs": { - "column": " \"Payroll (millions)\"", - "max_value": 198.96, - "min_value": 196.96 - }, - "meta": {} - }, - { - "expectation_type": "expect_column_mean_to_be_between", - "kwargs": { - "column": " \"Payroll (millions)\"", - "max_value": 99.01899999999998, - "min_value": 97.01899999999998 - }, - "meta": {} - }, - { - "expectation_type": "expect_column_median_to_be_between", - "kwargs": { - "column": " \"Payroll (millions)\"", - "max_value": 86.75, - "min_value": 84.75 - }, - "meta": {} - }, - { - "expectation_type": "expect_column_quantile_values_to_be_between", - "kwargs": { - "allow_relative_error": false, - "column": " \"Payroll (millions)\"", - "quantile_ranges": { - "quantiles": [ - 0.05, - 0.25, - 0.5, - 0.75, - 0.95 - ], - "value_ranges": [ - [ - 54.37, - 56.37 - ], - [ - 74.48, - 76.48 - ], - [ - 82.31, - 84.31 - ], - [ - 116.62, - 118.62 - ], - [ - 173.54, - 175.54 - ] - ] - } - }, - "meta": {} - }, - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "Team" - }, - "meta": {} - }, - { - "expectation_type": "expect_column_value_lengths_to_be_between", - "kwargs": { - "column": "Team", - "min_value": 1 - }, - "meta": {} - } - ], - "meta": { - "BasicSuiteBuilderProfiler": { - "batch_kwargs": { - "data_asset_name": "basic", - "datasource": "blink", - "path": 
"C:\\Users\\leorf\\PycharmProjects\\elementl\\dagster\\python_modules\\libraries\\dagster-ge\\great_expectations\\..\\.\\basic.csv" - }, - "created_at": 1592942378.2444508, - "created_by": "BasicSuiteBuilderProfiler" - }, - "citations": [ - { - "batch_kwargs": { - "data_asset_name": "basic", - "datasource": "blink", - "path": "C:\\Users\\leorf\\PycharmProjects\\elementl\\dagster\\python_modules\\libraries\\dagster-ge\\great_expectations\\..\\.\\basic.csv" - }, - "batch_markers": { - "ge_load_time": "20200623T195938.167653Z", - "pandas_data_fingerprint": "8c46fdaf0bd356fd58b7bcd9b2e6012d" - }, - "batch_parameters": null, - "citation_date": "20200623T195938.274371Z", - "comment": "BasicSuiteBuilderProfiler added a citation based on the current batch." - } - ], - "columns": { - " \"Payroll (millions)\"": { - "description": "" - }, - " \"Wins\"": { - "description": "" - }, - "Team": { - "description": "" - } - }, - "great_expectations.__version__": "0.11.5", - "notes": { - "content": [ - "#### This is an _example_ suite\n\n- This suite was made by quickly glancing at 1000 rows of your data.\n- This is **not a production suite**. It is meant to show examples of expectations.\n- Because this suite was auto-generated using a very basic profiler that does not know your data like you do, many of the expectations may not be meaningful.\n" - ], - "format": "markdown" - } - } -} \ No newline at end of file diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/great_expectations.yml b/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/great_expectations.yml deleted file mode 100644 index a3f56ea1ecbe7..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/great_expectations.yml +++ /dev/null @@ -1,134 +0,0 @@ -# Welcome to Great Expectations! Always know what to expect from your data. -# -# Here you can define datasources, batch kwargs generators, integrations and -# more. This file is intended to be committed to your repo. For help with -# configuration please: -# - Read our docs: https://docs.greatexpectations.io/en/latest/reference/data_context_reference.html#configuration -# - Join our slack channel: http://greatexpectations.io/slack - -config_version: 2.0 - -# Datasources tell Great Expectations where your data lives and how to get it. -# You can use the CLI command `great_expectations datasource new` to help you -# add a new datasource. Read more at https://docs.greatexpectations.io/en/latest/features/datasource.html -datasources: - getest: - batch_kwargs_generators: - subdir_reader: - class_name: SubdirReaderBatchKwargsGenerator - base_directory: ..\. - class_name: PandasDatasource - module_name: great_expectations.datasource - data_asset_type: - class_name: PandasDataset - module_name: great_expectations.dataset - getestspark: - batch_kwargs_generators: - subdir_reader: - class_name: SubdirReaderBatchKwargsGenerator - base_directory: ..\. - class_name: SparkDFDatasource - module_name: great_expectations.datasource - data_asset_type: - class_name: SparkDFDataset - module_name: great_expectations.dataset - -# This config file supports variable substitution which enables: 1) keeping -# secrets out of source control & 2) environment-based configuration changes -# such as staging vs prod. 
-# -# When GE encounters substitution syntax (like `my_key: ${my_value}` or -# `my_key: $my_value`) in the great_expectations.yml file, it will attempt -# to replace the value of `my_key` with the value from an environment -# variable `my_value` or a corresponding key read from this config file, -# which is defined through the `config_variables_file_path`. -# Environment variables take precedence over variables defined here. -# -# Substitution values defined here can be a simple (non-nested) value, -# nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) -# -# -# https://docs.greatexpectations.io/en/latest/how_to_guides/configuring_data_contexts/how_to_use_a_yaml_file_or_environment_variables_to_populate_credentials.html - -config_variables_file_path: uncommitted/config_variables.yml - -# The plugins_directory will be added to your python path for custom modules -# used to override and extend Great Expectations. -plugins_directory: plugins/ - -# Validation Operators are customizable workflows that bundle the validation of -# one or more expectation suites and subsequent actions. The example below -# stores validations and send a slack notification. To read more about -# customizing and extending these, read: https://docs.greatexpectations.io/en/latest/features/validation_operators_and_actions.html -validation_operators: - action_list_operator: - # To learn how to configure sending Slack notifications during evaluation - # (and other customizations), read: https://docs.greatexpectations.io/en/latest/reference/validation_operators/action_list_validation_operator.html - class_name: ActionListValidationOperator - action_list: - - name: store_validation_result - action: - class_name: StoreValidationResultAction - - name: store_evaluation_params - action: - class_name: StoreEvaluationParametersAction - - name: update_data_docs - action: - class_name: UpdateDataDocsAction - # - name: send_slack_notification_on_validation_result - # action: - # class_name: SlackNotificationAction - # # put the actual webhook URL in the uncommitted/config_variables.yml file - # slack_webhook: ${validation_notification_slack_webhook} - # notify_on: all # possible values: "all", "failure", "success" - # renderer: - # module_name: great_expectations.render.renderer.slack_renderer - # class_name: SlackRenderer - -stores: - # Stores are configurable places to store things like Expectations, Validations - # Data Docs, and more. These are for advanced users only - most users can simply - # leave this section alone. - # - # Three stores are required: expectations, validations, and - # evaluation_parameters, and must exist with a valid store entry. Additional - # stores can be configured for uses such as data_docs, validation_operators, etc. - expectations_store: - class_name: ExpectationsStore - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: expectations/ - - validations_store: - class_name: ValidationsStore - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: uncommitted/validations/ - - evaluation_parameter_store: - # Evaluation Parameters enable dynamic expectations. Read more here: - # https://docs.greatexpectations.io/en/latest/reference/evaluation_parameters.html - class_name: EvaluationParameterStore - -expectations_store_name: expectations_store -validations_store_name: validations_store -evaluation_parameter_store_name: evaluation_parameter_store - -data_docs_sites: - # Data Docs make it simple to visualize data quality in your project. 
These - # include Expectations, Validations & Profiles. The are built for all - # Datasources from JSON artifacts in the local repo including validations & - # profiles from the uncommitted directory. Read more at https://docs.greatexpectations.io/en/latest/features/data_docs.html - local_site: - class_name: SiteBuilder - # set to false to hide how-to buttons in Data Docs - show_how_to_buttons: true - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: uncommitted/data_docs/local_site/ - site_index_builder: - class_name: DefaultSiteIndexBuilder - -anonymous_usage_statistics: - data_context_id: 9656cbae-5408-4731-a54c-439358114e8f - enabled: true diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/notebooks/pandas/validation_playground.ipynb b/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/notebooks/pandas/validation_playground.ipynb deleted file mode 100644 index 61c5c58a18a1e..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/notebooks/pandas/validation_playground.ipynb +++ /dev/null @@ -1,243 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Validation Playground\n", - "\n", - "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", - "\n", - "#### This notebook assumes that you created at least one expectation suite in your project.\n", - "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", - "\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from great_expectations.datasource.types import BatchKwargs\n", - "import datetime" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Choose an Expectation Suite\n", - "\n", - "List expectation suites that you created in your project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.list_expectation_suite_names()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = None # TODO: set to a name from the list above" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list datasources of the type PandasDatasource in your project\n", - "[\n", - " datasource[\"name\"]\n", - " for datasource in context.list_datasources()\n", - " if datasource[\"class_name\"] == \"PandasDatasource\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datasource_name = None # TODO: set to a datasource name from above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you would like to validate a file on a filesystem:\n", - "batch_kwargs = {\"path\": \"YOUR_FILE_PATH\", \"datasource\": datasource_name}\n", - "\n", - "# If you already loaded the data into a Pandas Data Frame:\n", - "batch_kwargs = {\"dataset\": \"YOUR_DATAFRAME\", \"datasource\": datasource_name}\n", - "\n", - "\n", - "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Validate the batch with Validation Operators\n", - "\n", - "`Validation Operators` provide a convenient way to bundle the validation of\n", - "multiple expectation suites and the actions that should be taken after validation.\n", - "\n", - "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", - "\n", - "* validating a group of batches that are logically related\n", - "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", - "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", - "\n", - "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", - "\n", - "\"\"\"\n", - "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. 
View the Validation Results in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", - "\n", - "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You ran Validations!\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Read about the typical workflow with Great Expectations:\n", - "\n", - "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/notebooks/spark/validation_playground.ipynb b/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/notebooks/spark/validation_playground.ipynb deleted file mode 100644 index 555c4dd32b252..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/notebooks/spark/validation_playground.ipynb +++ /dev/null @@ -1,244 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Validation Playground\n", - "\n", - "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", - "\n", - "#### This notebook assumes that you created at least one expectation suite in your project.\n", - "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", - "\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from 
great_expectations.datasource.types import BatchKwargs\n", - "import datetime" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Choose an Expectation Suite\n", - "\n", - "List expectation suites that you created in your project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.list_expectation_suite_names()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = None # TODO: set to a name from the list above" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list datasources of the type SparkDFDatasource in your project\n", - "[\n", - " datasource[\"name\"]\n", - " for datasource in context.list_datasources()\n", - " if datasource[\"class_name\"] == \"SparkDFDatasource\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datasource_name = None # TODO: set to a datasource name from above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you would like to validate a file on a filesystem:\n", - "batch_kwargs = {\"path\": \"YOUR_FILE_PATH\", \"datasource\": datasource_name}\n", - "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true')\n", - "\n", - "# If you already loaded the data into a PySpark Data Frame:\n", - "batch_kwargs = {\"dataset\": \"YOUR_DATAFRAME\", \"datasource\": datasource_name}\n", - "\n", - "\n", - "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. 
Validate the batch with Validation Operators\n", - "\n", - "`Validation Operators` provide a convenient way to bundle the validation of\n", - "multiple expectation suites and the actions that should be taken after validation.\n", - "\n", - "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", - "\n", - "* validating a group of batches that are logically related\n", - "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", - "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", - "\n", - "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", - "\n", - "\"\"\"\n", - "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. View the Validation Results in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", - "\n", - "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You ran Validations!\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Read about the typical workflow with Great Expectations:\n", - "\n", - "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. 
Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/notebooks/sql/validation_playground.ipynb b/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/notebooks/sql/validation_playground.ipynb deleted file mode 100644 index 17e21e1308c4f..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/notebooks/sql/validation_playground.ipynb +++ /dev/null @@ -1,246 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Validation Playground\n", - "\n", - "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", - "\n", - "#### This notebook assumes that you created at least one expectation suite in your project.\n", - "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", - "\n", - "\n", - "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from great_expectations.datasource.types import BatchKwargs\n", - "import datetime" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Get a DataContext\n", - "This represents your **project** that you just created using `great_expectations init`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context = ge.data_context.DataContext()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Choose an Expectation Suite\n", - "\n", - "List expectation suites that you created in your project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.list_expectation_suite_names()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "expectation_suite_name = None # TODO: set to a name from the list above" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Load a batch of data you want to validate\n", - "\n", - "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# list datasources of the type SqlAlchemyDatasource in your project\n", - "[\n", - " datasource[\"name\"]\n", - " for datasource in context.list_datasources()\n", - " if datasource[\"class_name\"] == \"SqlAlchemyDatasource\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datasource_name = None # TODO: set to a datasource name from above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you would like to validate an entire table or view in your database's default schema:\n", - "batch_kwargs = {\"table\": \"YOUR_TABLE\", \"datasource\": datasource_name}\n", - "\n", - "# If you would like to validate an entire table or view from a non-default schema in your database:\n", - "batch_kwargs = {\"table\": \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", \"datasource\": datasource_name}\n", - "\n", - "# If you would like to validate the result set of a query:\n", - "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", - "\n", - "\n", - "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Validate the batch with Validation Operators\n", - "\n", - "`Validation Operators` provide a convenient way to bundle the validation of\n", - "multiple expectation suites and the actions that should be taken after validation.\n", - "\n", - "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", - "\n", - "* validating a group of batches that are logically related\n", - "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", - "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", - "\n", - "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", - "\n", - "\"\"\"\n", - "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. View the Validation Results in Data Docs\n", - "\n", - "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", - "\n", - "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "context.open_data_docs()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations! You ran Validations!\n", - "\n", - "## Next steps:\n", - "\n", - "### 1. Read about the typical workflow with Great Expectations:\n", - "\n", - "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", - "\n", - "### 2. Explore the documentation & community\n", - "\n", - "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css b/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css deleted file mode 100644 index 8bf5a15216a85..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css +++ /dev/null @@ -1,22 +0,0 @@ -/*index page*/ -.ge-index-page-site-name-title {} -.ge-index-page-table-container {} -.ge-index-page-table {} -.ge-index-page-table-profiling-links-header {} -.ge-index-page-table-expectations-links-header {} -.ge-index-page-table-validations-links-header {} -.ge-index-page-table-profiling-links-list {} -.ge-index-page-table-profiling-links-item {} -.ge-index-page-table-expectation-suite-link {} -.ge-index-page-table-validation-links-list {} -.ge-index-page-table-validation-links-item {} - -/*breadcrumbs*/ -.ge-breadcrumbs {} -.ge-breadcrumbs-item {} - -/*navigation sidebar*/ -.ge-navigation-sidebar-container {} -.ge-navigation-sidebar-content {} -.ge-navigation-sidebar-title {} -.ge-navigation-sidebar-link {} diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/uncommitted/config_variables.yml b/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/uncommitted/config_variables.yml deleted file mode 100644 index dc196f7486ca2..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/uncommitted/config_variables.yml +++ /dev/null @@ -1,18 +0,0 @@ -# This config file supports variable substitution which enables: 1) keeping -# secrets out of source control & 2) environment-based configuration changes -# such as staging vs prod. -# -# When GE encounters substitution syntax (like `my_key: ${my_value}` or -# `my_key: $my_value`) in the great_expectations.yml file, it will attempt -# to replace the value of `my_key` with the value from an environment -# variable `my_value` or a corresponding key read from this config file, -# which is defined through the `config_variables_file_path`. -# Environment variables take precedence over variables defined here. -# -# Substitution values defined here can be a simple (non-nested) value, -# nested value such as a dictionary, or an environment variable (i.e. 
${ENV_VAR}) -# -# -# https://docs.greatexpectations.io/en/latest/how_to_guides/configuring_data_contexts/how_to_use_a_yaml_file_or_environment_variables_to_populate_credentials.html - -instance_id: bd20cf38-7fa3-4477-96ed-fad020607cec diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/uncommitted/edit_basic.warning.ipynb b/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/uncommitted/edit_basic.warning.ipynb deleted file mode 100644 index 8a34a62819abd..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations/uncommitted/edit_basic.warning.ipynb +++ /dev/null @@ -1,282 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Edit Your Expectation Suite\n", - "Use this notebook to recreate and modify your expectation suite:\n", - "\n", - "**Expectation Suite Name**: `basic.warning`\n", - "\n", - "We'd love it if you **reach out to us on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2020-06-24T09:51:24-0400 - INFO - Great Expectations logging enabled at 20 level by JupyterUX module.\n" - ] - }, - { - "ename": "DataContextError", - "evalue": "expectation_suite basic.warning not found", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mDataContextError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;31m# remove the other one.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[0mexpectation_suite_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"basic.warning\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 15\u001b[1;33m \u001b[0msuite\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcontext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_expectation_suite\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mexpectation_suite_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 16\u001b[0m \u001b[0msuite\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexpectations\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 17\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\users\\leorf\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\great_expectations\\data_context\\data_context.py\u001b[0m in \u001b[0;36mget_expectation_suite\u001b[1;34m(self, expectation_suite_name)\u001b[0m\n\u001b[0;32m 1154\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1155\u001b[0m raise ge_exceptions.DataContextError(\n\u001b[1;32m-> 1156\u001b[1;33m \u001b[1;34m\"expectation_suite %s not found\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mexpectation_suite_name\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1157\u001b[0m )\n\u001b[0;32m 1158\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mDataContextError\u001b[0m: expectation_suite basic.warning not found" - ], - "output_type": "error" - } - ], - "source": [ - "import dagstermill\n", - "import json\n", - "import datetime\n", - "import 
great_expectations as ge\n", - "import great_expectations.jupyter_ux\n", - "from great_expectations.data_context.types.resource_identifiers import (\n", - " ValidationResultIdentifier,\n", - ")\n", - "\n", - "context = ge.data_context.DataContext()\n", - "\n", - "# Feel free to change the name of your suite here. Renaming this will not\n", - "# remove the other one.\n", - "expectation_suite_name = \"basic.warning\"\n", - "suite = context.get_expectation_suite(expectation_suite_name)\n", - "suite.expectations = []\n", - "\n", - "batch_kwargs = {\n", - " \"data_asset_name\": \"basic\",\n", - " \"datasource\": \"blink\",\n", - " \"path\": \"C:\\\\Users\\\\leorf\\\\PycharmProjects\\\\elementl\\\\dagster\\\\python_modules\\\\libraries\\\\dagster-ge\\\\great_expectations\\\\..\\\\.\\\\basic.csv\",\n", - "}\n", - "batch = context.get_batch(batch_kwargs, suite)\n", - "batch.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create & Edit Expectations\n", - "\n", - "Add expectations by calling specific expectation methods on the `batch` object. They all begin with `.expect_` which makes autocompleting easy using tab.\n", - "\n", - "You can see all the available expectations in the **[expectation glossary](https://docs.greatexpectations.io/en/latest/reference/glossary_of_expectations.html?utm_source=notebook&utm_medium=create_expectations)**." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Table Expectation(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_table_row_count_to_be_between(max_value=33, min_value=27)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_table_column_count_to_equal(value=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_table_columns_to_match_ordered_list(\n", - " column_list=[\"Team\", ' \"Payroll (millions)\"', ' \"Wins\"']\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Column Expectation(s)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### ` \"Payroll (millions)\"`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_values_to_not_be_null(' \"Payroll (millions)\"')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_min_to_be_between(' \"Payroll (millions)\"', max_value=56.24, min_value=54.24)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_max_to_be_between(' \"Payroll (millions)\"', max_value=198.96, min_value=196.96)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_mean_to_be_between(\n", - " ' \"Payroll (millions)\"', max_value=99.01899999999998, min_value=97.01899999999998\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_median_to_be_between(' \"Payroll (millions)\"', max_value=86.75, min_value=84.75)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"batch.expect_column_quantile_values_to_be_between(\n", - " ' \"Payroll (millions)\"',\n", - " allow_relative_error=False,\n", - " quantile_ranges={\n", - " \"quantiles\": [0.05, 0.25, 0.5, 0.75, 0.95],\n", - " \"value_ranges\": [\n", - " [54.37, 56.37],\n", - " [74.48, 76.48],\n", - " [82.31, 84.31],\n", - " [116.62, 118.62],\n", - " [173.54, 175.54],\n", - " ],\n", - " },\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `Team`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_values_to_not_be_null(\"Team\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.expect_column_value_lengths_to_be_between(\"Team\", min_value=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save & Review Your Expectations\n", - "\n", - "Let's save the expectation suite as a JSON file in the `great_expectations/expectations` directory of your project.\n", - "If you decide not to save some expectations that you created, use [remove_expectation method](https://docs.greatexpectations.io/en/latest/module_docs/data_asset_module.html?highlight=remove_expectation&utm_source=notebook&utm_medium=edit_expectations#great_expectations.data_asset.data_asset.DataAsset.remove_expectation).\n", - "\n", - "Let's now rebuild your Data Docs, which helps you communicate about your data with both machines and humans." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch.save_expectation_suite(discard_failed_expectations=False)\n", - "\n", - "\"\"\"\n", - "Let's create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", - "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", - "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", - "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", - "be None and run_time will default to the current UTC datetime.\n", - "\"\"\"\n", - "\n", - "run_id = {\n", - " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", - " \"run_time\": datetime.datetime.now(datetime.timezone.utc),\n", - "}\n", - "results = context.run_validation_operator(\n", - " \"action_list_operator\", assets_to_validate=[batch], run_id=run_id\n", - ")\n", - "dagstermill.yield_result(json.loads(str(results.list_validation_results())))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dagster", - "language": "python", - "name": "dagster" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations_v3/expectations/basic/warning.json b/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations_v3/expectations/basic/warning.json deleted file mode 100644 index 4ac8f88e2b573..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations_v3/expectations/basic/warning.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "data_asset_type": "Dataset", - "expectation_suite_name": "basic.warning", - "expectations": [ - { - "expectation_type": "expect_table_row_count_to_be_between", - "kwargs": { - "max_value": 33, - "min_value": 27 - }, - "meta": {} - }, - { - "expectation_type": "expect_table_column_count_to_equal", - "kwargs": { - "value": 3 - }, - "meta": {} - }, - { - "expectation_type": "expect_table_columns_to_match_ordered_list", - "kwargs": { - "column_list": [ - "Team", - " \"Payroll (millions)\"", - " \"Wins\"" - ] - }, - "meta": {} - }, - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": " \"Payroll (millions)\"" - }, - "meta": {} - }, - { - "expectation_type": "expect_column_min_to_be_between", - "kwargs": { - "column": " \"Payroll (millions)\"", - "max_value": 56.24, - "min_value": 54.24 - }, - "meta": {} - }, - { - "expectation_type": "expect_column_max_to_be_between", - "kwargs": { - "column": " \"Payroll (millions)\"", - "max_value": 198.96, - "min_value": 196.96 - }, - "meta": {} - }, - { - "expectation_type": "expect_column_mean_to_be_between", - "kwargs": { - "column": " \"Payroll (millions)\"", - "max_value": 99.01899999999998, - "min_value": 97.01899999999998 - }, - "meta": {} - }, - { - "expectation_type": "expect_column_median_to_be_between", - "kwargs": { - "column": " \"Payroll (millions)\"", - "max_value": 86.75, - "min_value": 84.75 - }, - "meta": {} - }, - { - "expectation_type": "expect_column_quantile_values_to_be_between", - "kwargs": { - "allow_relative_error": false, - "column": " \"Payroll (millions)\"", - "quantile_ranges": { - "quantiles": [ - 0.05, - 0.25, - 0.5, - 0.75, - 0.95 - ], - "value_ranges": [ - [ - 54.37, - 56.37 - ], - [ - 74.48, - 76.48 - ], - [ - 82.31, - 84.31 - ], - [ - 116.62, - 118.62 - ], - [ - 173.54, - 175.54 - ] - ] - } - }, - "meta": {} - }, - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "Team" - }, - "meta": {} - }, - { - "expectation_type": "expect_column_value_lengths_to_be_between", - "kwargs": { - "column": "Team", - "min_value": 1 - }, - "meta": {} - } - ], - 
"meta": { - "BasicSuiteBuilderProfiler": { - "batch_kwargs": { - "data_asset_name": "basic", - "datasource": "blink", - "path": "C:\\Users\\leorf\\PycharmProjects\\elementl\\dagster\\python_modules\\libraries\\dagster-ge\\great_expectations\\..\\.\\basic.csv" - }, - "created_at": 1592942378.2444508, - "created_by": "BasicSuiteBuilderProfiler" - }, - "citations": [ - { - "batch_kwargs": { - "data_asset_name": "basic", - "datasource": "blink", - "path": "C:\\Users\\leorf\\PycharmProjects\\elementl\\dagster\\python_modules\\libraries\\dagster-ge\\great_expectations\\..\\.\\basic.csv" - }, - "batch_markers": { - "ge_load_time": "20200623T195938.167653Z", - "pandas_data_fingerprint": "8c46fdaf0bd356fd58b7bcd9b2e6012d" - }, - "batch_parameters": null, - "citation_date": "20200623T195938.274371Z", - "comment": "BasicSuiteBuilderProfiler added a citation based on the current batch." - } - ], - "columns": { - " \"Payroll (millions)\"": { - "description": "" - }, - " \"Wins\"": { - "description": "" - }, - "Team": { - "description": "" - } - }, - "great_expectations.__version__": "0.11.5", - "notes": { - "content": [ - "#### This is an _example_ suite\n\n- This suite was made by quickly glancing at 1000 rows of your data.\n- This is **not a production suite**. It is meant to show examples of expectations.\n- Because this suite was auto-generated using a very basic profiler that does not know your data like you do, many of the expectations may not be meaningful.\n" - ], - "format": "markdown" - } - } -} \ No newline at end of file diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations_v3/.gitignore b/python_modules/libraries/dagster-ge/dagster_ge_tests/gx/.gitignore similarity index 100% rename from python_modules/libraries/dagster-ge/dagster_ge_tests/great_expectations_v3/.gitignore rename to python_modules/libraries/dagster-ge/dagster_ge_tests/gx/.gitignore diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/gx/expectations/.ge_store_backend_id b/python_modules/libraries/dagster-ge/dagster_ge_tests/gx/expectations/.ge_store_backend_id new file mode 100644 index 0000000000000..4f55d2a324871 --- /dev/null +++ b/python_modules/libraries/dagster-ge/dagster_ge_tests/gx/expectations/.ge_store_backend_id @@ -0,0 +1 @@ +store_backend_id = fae1459e-fbb0-4ef6-b665-eaabc63e9041 diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/expectations/basic/warning.json b/python_modules/libraries/dagster-ge/dagster_ge_tests/gx/expectations/basic/warning.json similarity index 100% rename from python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/expectations/basic/warning.json rename to python_modules/libraries/dagster-ge/dagster_ge_tests/gx/expectations/basic/warning.json diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/gx/great_expectations.yml b/python_modules/libraries/dagster-ge/dagster_ge_tests/gx/great_expectations.yml new file mode 100644 index 0000000000000..3f8bc71c14e64 --- /dev/null +++ b/python_modules/libraries/dagster-ge/dagster_ge_tests/gx/great_expectations.yml @@ -0,0 +1,103 @@ +# Welcome to Great Expectations! Always know what to expect from your data. +# +# Here you can define datasources, batch kwargs generators, integrations and +# more. This file is intended to be committed to your repo. 
For help with +# configuration please: +# - Read our docs: https://docs.greatexpectations.io/en/latest/reference/spare_parts/data_context_reference.html#configuration +# - Join our slack channel: http://greatexpectations.io/slack + +# config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility +# It is auto-generated and usually does not need to be changed. +config_version: 3.0 + +# Datasources tell Great Expectations where your data lives and how to get it. +# You can use the CLI command `great_expectations datasource new` to help you +# add a new datasource. Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/datasource.html +datasources: + getestspark: + class_name: Datasource + module_name: great_expectations.datasource + data_connectors: + my_runtime_data_connector: + class_name: RuntimeDataConnector + module_name: great_expectations.datasource.data_connector + batch_identifiers: + - foo + name: my_runtime_data_connector + execution_engine: + module_name: great_expectations.execution_engine + class_name: SparkDFExecutionEngine + getest: + class_name: Datasource + module_name: great_expectations.datasource + data_connectors: + my_runtime_data_connector: + class_name: RuntimeDataConnector + module_name: great_expectations.datasource.data_connector + batch_identifiers: + - foo + name: my_runtime_data_connector + execution_engine: + module_name: great_expectations.execution_engine + class_name: PandasExecutionEngine +config_variables_file_path: uncommitted/config_variables.yml + +# The plugins_directory will be added to your python path for custom modules +# used to override and extend Great Expectations. +plugins_directory: plugins/ + +stores: + # Stores are configurable places to store things like Expectations, Validations + # Data Docs, and more. These are for advanced users only - most users can simply + # leave this section alone. + # + # Three stores are required: expectations, validations, and + # evaluation_parameters, and must exist with a valid store entry. Additional + # stores can be configured for uses such as data_docs, etc. + expectations_store: + class_name: ExpectationsStore + store_backend: + class_name: TupleFilesystemStoreBackend + base_directory: expectations/ + + validations_store: + class_name: ValidationsStore + store_backend: + class_name: TupleFilesystemStoreBackend + base_directory: uncommitted/validations/ + + evaluation_parameter_store: + class_name: EvaluationParameterStore + checkpoint_store: + class_name: CheckpointStore + store_backend: + class_name: TupleFilesystemStoreBackend + suppress_store_backend_id: true + base_directory: checkpoints/ + +expectations_store_name: expectations_store +validations_store_name: validations_store +evaluation_parameter_store_name: evaluation_parameter_store +checkpoint_store_name: checkpoint_store + +data_docs_sites: + # Data Docs make it simple to visualize data quality in your project. These + # include Expectations, Validations & Profiles. The are built for all + # Datasources from JSON artifacts in the local repo including validations & + # profiles from the uncommitted directory. 
Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/data_docs.html + local_site: + class_name: SiteBuilder + show_how_to_buttons: true + store_backend: + class_name: TupleFilesystemStoreBackend + base_directory: uncommitted/data_docs/local_site/ + site_index_builder: + class_name: DefaultSiteIndexBuilder +notebooks: +anonymous_usage_statistics: + data_context_id: fae1459e-fbb0-4ef6-b665-eaabc63e9041 + enabled: true +include_rendered_content: + expectation_suite: false + expectation_validation_result: false + globally: false diff --git a/python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css b/python_modules/libraries/dagster-ge/dagster_ge_tests/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css similarity index 100% rename from python_modules/libraries/dagster-ge/dagster_ge/examples/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css rename to python_modules/libraries/dagster-ge/dagster_ge_tests/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/test_basic_integ.py b/python_modules/libraries/dagster-ge/dagster_ge_tests/test_basic_integ.py deleted file mode 100644 index ac9c2e1bb0dab..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/test_basic_integ.py +++ /dev/null @@ -1,112 +0,0 @@ -import pytest -from dagster import In, Output, job, op -from dagster._utils import file_relative_path -from dagster_ge.factory import ( - ge_data_context, - ge_validation_op_factory, - ge_validation_op_factory_v3, -) -from dagster_pyspark import ( - DataFrame as DagsterPySparkDataFrame, - pyspark_resource, -) -from pandas import read_csv - - -@op -def pandas_yielder(_): - return read_csv(file_relative_path(__file__, "./basic.csv")) - - -@op(required_resource_keys={"pyspark"}) -def pyspark_yielder(context): - return ( - context.resources.pyspark.spark_session.read.format("csv") - .options(header="true", inferSchema="true") - .load(file_relative_path(__file__, "./basic.csv")) - ) - - -@op(ins={"res": In()}) -def reyielder(_context, res): - yield Output((res["statistics"], res["results"])) - - -@job(resource_defs={"ge_data_context": ge_data_context}) -def hello_world_pandas_job_v2(): - reyielder( - ge_validation_op_factory("ge_validation_op", "getest", "basic.warning")(pandas_yielder()) - ) - - -@job(resource_defs={"ge_data_context": ge_data_context}) -def hello_world_pandas_job_v3(): - reyielder( - ge_validation_op_factory_v3( - name="ge_validation_op", - datasource_name="getest", - data_connector_name="my_runtime_data_connector", - data_asset_name="test_asset", - suite_name="basic.warning", - batch_identifiers={"foo": "bar"}, - )(pandas_yielder()) - ) - - -@job( - resource_defs={ - "ge_data_context": ge_data_context, - "pyspark": pyspark_resource, - } -) -def hello_world_pyspark_job(): - validate = ge_validation_op_factory( - "ge_validation_op", - "getestspark", - "basic.warning", - input_dagster_type=DagsterPySparkDataFrame, - ) - reyielder(validate(pyspark_yielder())) - - -@pytest.mark.parametrize( - "job_def, ge_dir", - [ - (hello_world_pandas_job_v2, "./great_expectations"), - (hello_world_pandas_job_v3, "./great_expectations_v3"), - ], -) -def test_yielded_results_config_pandas(snapshot, job_def, ge_dir): - run_config = { - "resources": { - "ge_data_context": {"config": {"ge_root_dir": file_relative_path(__file__, ge_dir)}} - } - } - result = 
job_def.execute_in_process(run_config=run_config) - assert result.output_for_node("reyielder")[0]["success_percent"] == 100 - expectations = result.expectation_results_for_node("ge_validation_op") - assert len(expectations) == 1 - mainexpect = expectations[0] - assert mainexpect.success - # purge system specific metadata for testing - metadata = mainexpect.metadata["Expectation Results"].md_str.split("### Info")[0] - snapshot.assert_match(metadata) - - -def test_yielded_results_config_pyspark_v2(snapshot): # pylint:disable=unused-argument - run_config = { - "resources": { - "ge_data_context": { - "config": {"ge_root_dir": file_relative_path(__file__, "./great_expectations")} - } - } - } - result = hello_world_pyspark_job.execute_in_process(run_config=run_config) - assert result.output_for_node("reyielder")[0]["success_percent"] == 100 - expectations = result.expectation_results_for_node("ge_validation_op") - assert len(expectations) == 1 - mainexpect = expectations[0] - assert mainexpect.success - # purge system specific metadata for testing - metadata = mainexpect.metadata["Expectation Results"].md_str.split("### Info")[0] - snapshot.assert_match(metadata) diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/test_resources.py b/python_modules/libraries/dagster-ge/dagster_ge_tests/test_resources.py deleted file mode 100644 index c8ab633d8116e..0000000000000 --- a/python_modules/libraries/dagster-ge/dagster_ge_tests/test_resources.py +++ /dev/null @@ -1,63 +0,0 @@ -import pytest -from dagster import Output, job, op -from dagster._utils import file_relative_path -from dagster_ge.factory import ( - GEContextResource, - ge_validation_op_factory, - ge_validation_op_factory_v3, -) -from pandas import read_csv - - -@op -def pandas_yielder(_): - return read_csv(file_relative_path(__file__, "./basic.csv")) - - -@op -def reyielder(_context, res): - yield Output((res["statistics"], res["results"])) - - -@job -def hello_world_pandas_job_v2(): - reyielder( - ge_validation_op_factory("ge_validation_op", "getest", "basic.warning")(pandas_yielder()) - ) - - -@job -def hello_world_pandas_job_v3(): - reyielder( - ge_validation_op_factory_v3( - name="ge_validation_op", - datasource_name="getest", - data_connector_name="my_runtime_data_connector", - data_asset_name="test_asset", - suite_name="basic.warning", - batch_identifiers={"foo": "bar"}, - )(pandas_yielder()) - ) - - -@pytest.mark.parametrize( - "job_def, ge_dir", - [ - (hello_world_pandas_job_v2, "./great_expectations"), - (hello_world_pandas_job_v3, "./great_expectations_v3"), - ], -) -def test_yielded_results_config_pandas(snapshot, job_def, ge_dir): - result = job_def.execute_in_process( - resources={ - "ge_data_context": GEContextResource(ge_root_dir=file_relative_path(__file__, ge_dir)) - }, - ) - assert result.output_for_node("reyielder")[0]["success_percent"] == 100 - expectations = result.expectation_results_for_node("ge_validation_op") - assert len(expectations) == 1 - mainexpect = expectations[0] - assert mainexpect.success - # purge system specific metadata for testing - metadata = mainexpect.metadata["Expectation Results"].md_str.split("### Info")[0] - snapshot.assert_match(metadata) diff --git a/python_modules/libraries/dagster-ge/dagster_ge_tests/test_validation.py b/python_modules/libraries/dagster-ge/dagster_ge_tests/test_validation.py new file mode 100644 index 0000000000000..6cd1e91664037 --- /dev/null +++ b/python_modules/libraries/dagster-ge/dagster_ge_tests/test_validation.py @@ -0,0 +1,145 @@ +import pytest +from 
dagster import In, Output, graph, op +from dagster._core.definitions.metadata.metadata_value import MarkdownMetadataValue +from dagster._core.execution.context.op_execution_context import OpExecutionContext +from dagster._utils import file_relative_path +from dagster_ge.factory import GEContextResource, ge_data_context, ge_validation_op_factory +from dagster_pyspark import ( + DataFrame as DagsterPySparkDataFrame, + pyspark_resource, +) +from pandas import read_csv + + +@op(ins={"res": In()}) +def unpack_ge_results(_context, res): + yield Output((res["statistics"], res["results"])) + + +# ######################## +# ##### PANDAS +# ######################## + + +@op +def pandas_loader(_): + return read_csv(file_relative_path(__file__, "./basic.csv")) + + +pandas_validator = ge_validation_op_factory( + name="ge_validation_op", + datasource_name="getest", + data_connector_name="my_runtime_data_connector", + data_asset_name="test_asset", + suite_name="basic.warning", + batch_identifiers={"foo": "bar"}, +) + + +@graph +def pandas_graph(): + unpack_ge_results(pandas_validator(pandas_loader())) + + +pandas_job_with_resource = pandas_graph.to_job(resource_defs={"ge_data_context": ge_data_context}) + +pandas_job_no_resource = pandas_graph.to_job() + +# ######################## +# ##### PYSPARK +# ######################## + + +@op(required_resource_keys={"pyspark"}) +def pyspark_loader(context: OpExecutionContext): + return ( + context.resources.pyspark.spark_session.read.format("csv") + .options(header="true", inferSchema="true") + .load(file_relative_path(__file__, "./basic.csv")) + ) + + +pyspark_validator = ge_validation_op_factory( + name="ge_validation_op", + datasource_name="getestspark", + data_connector_name="my_runtime_data_connector", + data_asset_name="test_asset", + suite_name="basic.warning", + input_dagster_type=DagsterPySparkDataFrame, + batch_identifiers={"foo": "bar"}, +) + + +@graph +def pyspark_graph(): + return unpack_ge_results(pyspark_validator(pyspark_loader())) + + +pyspark_job_with_resource = pyspark_graph.to_job( + resource_defs={ + "ge_data_context": ge_data_context, + "pyspark": pyspark_resource, + } +) + +pyspark_job_no_resource = pyspark_graph.to_job() + +# ######################## +# ##### TESTS +# ######################## + +_GE_ROOT_DIR = file_relative_path(__file__, "./gx") + + +@pytest.mark.parametrize( + "data_backend, resource_style", + [ + ("pandas", "new"), + ("pandas", "old"), + ("pyspark", "new"), + ("pyspark", "old"), + ], +) +def test_ge_validation(snapshot, data_backend: str, resource_style: str): + # Used for old resource style + run_config = {"resources": {"ge_data_context": {"config": {"ge_root_dir": _GE_ROOT_DIR}}}} + + # Used for new resource style + ge_resource = GEContextResource(ge_root_dir=_GE_ROOT_DIR) + + # Compute result based on data backend and resource style. When resource_style=old, resources + # are set on the job and are configured via the passed run_config. When resource_style=new, + # resources are passed directly into `execute_in_process`. 
+ if data_backend == "pandas" and resource_style == "new": + result = pandas_job_no_resource.execute_in_process( + resources={ + "ge_data_context": ge_resource, + }, + ) + elif data_backend == "pandas" and resource_style == "old": + result = pandas_job_with_resource.execute_in_process(run_config=run_config) + elif data_backend == "pyspark" and resource_style == "new": + result = pyspark_job_no_resource.execute_in_process( + resources={ + "ge_data_context": ge_resource, + "pyspark": pyspark_resource, + }, + ) + elif data_backend == "pyspark" and resource_style == "old": + result = pyspark_job_with_resource.execute_in_process(run_config=run_config) + else: + raise ValueError("Invalid combination") + + assert result.output_for_node("unpack_ge_results")[0]["success_percent"] == 100 + expectations = result.expectation_results_for_node("ge_validation_op") + assert len(expectations) == 1 + mainexpect = expectations[0] + assert mainexpect.success + # purge system specific metadata for testing + result_markdown_metadata = mainexpect.metadata["Expectation Results"] + assert ( + isinstance(result_markdown_metadata, MarkdownMetadataValue) + and result_markdown_metadata.md_str + ) + result_markdown = result_markdown_metadata.md_str.split("### Info")[0] + snapshot.assert_match(result_markdown) diff --git a/python_modules/libraries/dagster-ge/setup.py b/python_modules/libraries/dagster-ge/setup.py index 74397942f71af..55bcb149cb356 100644 --- a/python_modules/libraries/dagster-ge/setup.py +++ b/python_modules/libraries/dagster-ge/setup.py @@ -36,9 +36,7 @@ def get_version() -> str: f"dagster{pin}", f"dagster-pandas{pin}", "pandas", - # Pin numpy pending update of great_expectations - "numpy<2", - "great_expectations >=0.11.9, !=0.12.8, !=0.13.17, !=0.13.27, <0.17.12", + "great_expectations >=0.17.15", ], zip_safe=False, ) diff --git a/scripts/install_dev_python_modules.py b/scripts/install_dev_python_modules.py index af860e611b140..d97c6bff7c7c9 100644 --- a/scripts/install_dev_python_modules.py +++ b/scripts/install_dev_python_modules.py @@ -70,6 +70,7 @@ def main( "python_modules/libraries/dagster-gcp", "python_modules/libraries/dagster-gcp-pandas", "python_modules/libraries/dagster-gcp-pyspark", + "python_modules/libraries/dagster-ge", "python_modules/libraries/dagster-embedded-elt", "python_modules/libraries/dagster-fivetran", "python_modules/libraries/dagster-k8s", @@ -136,17 +137,6 @@ def main( "https://github.com/dagster-io/build-grpcio/wiki/Wheels", ] - # NOTE: `dagster-ge` is out of date and does not support recent versions of great expectations. - # Because of this, it has second-order dependencies on old versions of popular libraries like - # numpy which conflict with the requirements of our other libraries. For this reason, until - # dagster-ge is updated we won't install `dagster-ge` in the common dev environment or - # pre-install its dependencies in our BK images (which this script is used for). - # - # dagster-ge depends on a great_expectations version that does not install on Windows - # https://github.com/dagster-io/dagster/issues/3319 - # if sys.version_info >= (3, 7) and os.name != "nt": - # install_targets += ["-e python_modules/libraries/dagster-ge"] - # Ensure uv is installed which we use for faster package resolution subprocess.run(["pip", "install", "-U", "uv"], check=True)
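
A minimal sketch of how the updated `ge_validation_op_factory` API exercised by this diff is wired into a Dagster job. It mirrors the added test_validation.py and ge_demo.py changes rather than documenting the library: it assumes a GX project rooted at ./gx with the `getest` Pandas datasource, `my_runtime_data_connector`, and the `basic.warning` suite configured as in the files added above, plus a local ./basic.csv; the op and job names are illustrative only.

import pandas as pd
from dagster import job, op
from dagster._utils import file_relative_path
from dagster_ge.factory import ge_data_context, ge_validation_op_factory


@op
def load_payroll():
    # Load the CSV that the basic.warning suite was profiled against (assumed path).
    return pd.read_csv(file_relative_path(__file__, "./basic.csv"))


# Post-0.17 factory call: the data connector, data asset name, and batch
# identifiers are passed alongside the datasource and suite, matching the
# updated example and tests in this diff.
validate_payroll = ge_validation_op_factory(
    name="ge_validation_op",
    datasource_name="getest",
    data_connector_name="my_runtime_data_connector",
    data_asset_name="test_asset",
    suite_name="basic.warning",
    batch_identifiers={"foo": "bar"},
)


@job(resource_defs={"ge_data_context": ge_data_context})
def payroll_validation_job():
    validate_payroll(load_payroll())


if __name__ == "__main__":
    result = payroll_validation_job.execute_in_process(
        run_config={
            "resources": {
                "ge_data_context": {
                    "config": {"ge_root_dir": file_relative_path(__file__, "./gx")}
                }
            }
        }
    )
    # The validation op outputs the GE result dict; its "statistics" entry
    # carries success_percent, as asserted in the new test_validation.py.
    stats = result.output_for_node("ge_validation_op")["statistics"]
    assert stats["success_percent"] == 100.0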