From eb6b035cd3ad39bb0d6d342975458890dda55762 Mon Sep 17 00:00:00 2001 From: Niels Bantilan Date: Wed, 15 Mar 2023 18:25:37 -0400 Subject: [PATCH] fix regression: all missing columns should be reported (#1117) Signed-off-by: Niels Bantilan --- pandera/backends/pandas/container.py | 37 +++++++++++++++++++--------- tests/core/test_schemas.py | 21 ++++++++++++++++ 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/pandera/backends/pandas/container.py b/pandera/backends/pandas/container.py index dfbb7f74a..82c356dd8 100644 --- a/pandera/backends/pandas/container.py +++ b/pandera/backends/pandas/container.py @@ -67,8 +67,12 @@ def validate( try: self.check_column_presence(check_obj, schema, column_info) - except SchemaError as exc: - error_handler.collect_error(exc.reason_code, exc) + except SchemaErrors as exc: + for schema_error in exc.schema_errors: + error_handler.collect_error( + schema_error["reason_code"], + schema_error["error"], + ) # strictness check and filter try: @@ -471,18 +475,27 @@ def check_column_presence( ): """Check for presence of specified columns in the data object.""" if column_info.absent_column_names: - # NOTE: only report the first absent column for now, need to update - # this when backend stuff is complete - colname, *_ = column_info.absent_column_names - raise SchemaError( + reason_code = "column_not_in_dataframe" + raise SchemaErrors( schema=schema, + schema_errors=[ + { + "reason_code": reason_code, + "error": SchemaError( + schema=schema, + data=check_obj, + message=( + f"column '{colname}' not in dataframe" + f"\n{check_obj.head()}" + ), + failure_cases=scalar_failure_case(colname), + check="column_in_dataframe", + reason_code=reason_code, + ), + } + for colname in column_info.absent_column_names + ], data=check_obj, - message=( - f"column '{colname}' not in dataframe\n{check_obj.head()}" - ), - failure_cases=scalar_failure_case(colname), - check="column_in_dataframe", - reason_code="column_not_in_dataframe", ) def 
check_column_values_are_unique(self, check_obj: pd.DataFrame, schema): diff --git a/tests/core/test_schemas.py b/tests/core/test_schemas.py index 833473165..eee6a13d6 100644 --- a/tests/core/test_schemas.py +++ b/tests/core/test_schemas.py @@ -1965,3 +1965,24 @@ def test_column_set_unique(): assert not test_schema.columns["a"].unique test_schema = test_schema.update_column("a", unique=True) assert test_schema.columns["a"].unique + + +def test_missing_columns(): + """Test that multiple missing columns are correctly reported.""" + schema = DataFrameSchema( + { + "column3": Column(int), + "column2": Column(float), + } + ) + + df = pd.DataFrame({"column1": [1]}) + + try: + schema.validate(df, lazy=True) + except errors.SchemaErrors as exc: + assert (exc.failure_cases["check"] == "column_in_dataframe").all() + assert exc.failure_cases["failure_case"].tolist() == [ + "column3", + "column2", + ]