Skip to content

Commit

Permalink
Merge branch 'main' into milestone-2-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
dariakhv authored Dec 13, 2024
2 parents 119e66c + 3eb8d8c commit c952f35
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 16 deletions.
18 changes: 3 additions & 15 deletions scripts/validation_before_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
import click
import pandas as pd
import pandera as pa
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.validate_column_names import validate_column_names

# Define schemas for validation
def define_schemas():
Expand Down Expand Up @@ -59,21 +62,6 @@ def define_schemas():
return general_schema, outlier_schema, category_schema, duplicate_check, empty_row_check


# Validate column names
def validate_column_names(wine, correct_columns):
    """
    Check that the column names of ``wine`` are exactly ``correct_columns``.

    Parameters:
    ----------
    wine : pandas.DataFrame
        The DataFrame to validate. Only its ``columns`` attribute is read.
    correct_columns : set
        The expected column names. Any iterable of hashable names (list,
        tuple, ...) is also accepted and is converted to a set.

    Raises:
    ------
    ValueError
        If the DataFrame has columns that are not expected, is missing
        expected columns, or both. The offending names are listed in the
        message, sorted by their string form.

    Returns:
    -------
    None
        On success the function prints "Column name test passed!".
    """
    extracted_columns = set(wine.columns)
    # Coerce to a set so a list/tuple of names is compared by content rather
    # than always being "unequal" to a set and then lacking ``.difference``.
    expected_columns = set(correct_columns)

    # Sorting (by string form, so mixed-type labels cannot break it) makes the
    # error message reproducible instead of depending on set iteration order.
    wrong_columns = sorted(extracted_columns - expected_columns, key=str)
    missing_columns = sorted(expected_columns - extracted_columns, key=str)

    if wrong_columns and missing_columns:
        raise ValueError(f"Unexpected columns: {wrong_columns}, missing columns: {missing_columns}")
    if wrong_columns:
        raise ValueError(f"Unexpected columns: {wrong_columns}")
    if missing_columns:
        raise ValueError(f"Missing columns: {missing_columns}")

    print("Column name test passed!")

@click.command()
@click.option("--file_name", required=True, help="Name of the input CSV file.")
@click.option("--data_path", required=True, help="Path to the directory containing the file.")
Expand Down
48 changes: 48 additions & 0 deletions src/validate_column_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
def validate_column_names(wine, correct_columns):
    """
    This function validates that the column names of the provided DataFrame match the expected column names.

    Parameters:
    ----------
    wine : pandas.DataFrame
        The DataFrame to validate. Only its ``columns`` attribute is read.
    correct_columns : set
        A set of expected column names. Any other iterable of hashable names
        (list, tuple, ...) is also accepted and is converted to a set.

    Raises:
    ------
    ValueError
        If the column names in the DataFrame don't match the expected column names.
        The error will specify:
        - Unexpected columns (columns present in the DataFrame but not in the expected set).
        - Missing columns (columns expected but not present in the DataFrame).
        Names in each list are sorted by their string form, so the message is
        the same on every run.

    Returns:
    -------
    None
        If the column names match the expected set, the function will print "Column name test passed!".

    Example:
    -------
    >>> import pandas as pd
    >>> wine_df = pd.DataFrame(columns=["feature1", "feature2", "target"])
    >>> expected_columns = {"feature1", "feature2", "target"}
    >>> validate_column_names(wine_df, expected_columns)
    Column name test passed!
    >>> incorrect_df = pd.DataFrame(columns=["feature1", "feature3"])
    >>> validate_column_names(incorrect_df, expected_columns)
    Traceback (most recent call last):
        ...
    ValueError: Unexpected columns: ['feature3'], missing columns: ['feature2', 'target']
    """
    extracted_columns = set(wine.columns)
    # Coerce to a set so a list/tuple of names is compared by content rather
    # than always being "unequal" to a set and then lacking ``.difference``.
    expected_columns = set(correct_columns)

    # Sorting (by string form, so mixed-type labels cannot break it) makes the
    # error message reproducible instead of depending on set iteration order.
    wrong_columns = sorted(extracted_columns - expected_columns, key=str)
    missing_columns = sorted(expected_columns - extracted_columns, key=str)

    if wrong_columns and missing_columns:
        raise ValueError(f"Unexpected columns: {wrong_columns}, missing columns: {missing_columns}")
    if wrong_columns:
        raise ValueError(f"Unexpected columns: {wrong_columns}")
    if missing_columns:
        raise ValueError(f"Missing columns: {missing_columns}")

    print("Column name test passed!")
6 changes: 6 additions & 0 deletions tests/.ipynb_checkpoints/README-checkpoint
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pytest -v test_random_search.py \
--train_data=../data/proc/wine_train.csv \
--test_data=../data/proc/wine_test.csv \
--pipeline_path=../results/models/wine_pipeline.pickle

pytest test_validate_column_names.py
5 changes: 4 additions & 1 deletion tests/README
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Random Search Test
pytest -v test_random_search.py \
--train_data=../data/proc/wine_train.csv \
--test_data=../data/proc/wine_test.csv \
--pipeline_path=../results/models/wine_pipeline.pickle

# Column Name Validation Test
pytest test_validate_column_names.py

# Split Data Test
pytest test_split_data.py
43 changes: 43 additions & 0 deletions tests/test_validate_column_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pytest
import os
import pandas as pd
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.validate_column_names import validate_column_names

# Expected column names that every test below validates against.
correct_columns = {"column1", "column2", "column3"}


def _empty_frame(*names):
    """Return a row-less DataFrame whose columns are ``names``, in the given order."""
    return pd.DataFrame(columns=list(names))


# Row-less frames covering each scenario: exact match, one surplus column,
# one absent column, and no overlap with the expected names at all.
correct_df = _empty_frame("column1", "column2", "column3")
extra_column_df = _empty_frame("column1", "column2", "column3", "extra_column")
missing_column_df = _empty_frame("column1", "column2")
wrong_column_df = _empty_frame("wrong_column1", "wrong_column2", "wrong_column3")

# Test for correct column names
def test_validate_column_names_correct(capsys):
    """Matching columns must not raise and must print the success message."""
    # Call directly rather than wrapping in try/except + pytest.fail: an
    # unexpected ValueError then fails the test with its original traceback.
    assert validate_column_names(correct_df, correct_columns) is None

    # The function reports success via print(), so check captured stdout.
    assert capsys.readouterr().out == "Column name test passed!\n"

# Test for extra column
def test_validate_column_names_extra_column():
    """A single surplus column is reported alone, with no 'missing' part."""
    # Anchored pattern: a bare "Unexpected columns:" prefix would also accept
    # the combined unexpected-and-missing message.
    with pytest.raises(ValueError, match=r"^Unexpected columns: \['extra_column'\]$"):
        validate_column_names(extra_column_df, correct_columns)

# Test for missing column
def test_validate_column_names_missing_column():
    """A single absent column is reported by name in the 'Missing columns' message."""
    # missing_column_df lacks only "column3", so the full message is known.
    with pytest.raises(ValueError, match=r"^Missing columns: \['column3'\]$"):
        validate_column_names(missing_column_df, correct_columns)

# Test for completely wrong columns
def test_validate_column_names_wrong_column():
    """With no overlap at all, both the unexpected and the missing names are reported."""
    # Every expected column is absent here, so the combined message is raised;
    # require both halves instead of only the "Unexpected columns:" prefix.
    pattern = r"^Unexpected columns: \[.+\], missing columns: \[.+\]$"
    with pytest.raises(ValueError, match=pattern) as excinfo:
        validate_column_names(wrong_column_df, correct_columns)

    # Split on the separator so each name is checked in the correct half,
    # without depending on the order of names inside each list.
    unexpected_part, missing_part = str(excinfo.value).split(", missing columns: ")
    for name in ("wrong_column1", "wrong_column2", "wrong_column3"):
        assert repr(name) in unexpected_part
    for name in ("column1", "column2", "column3"):
        assert repr(name) in missing_part

# Test for both extra and missing columns
def test_validate_column_names_extra_and_missing():
    """One surplus column plus two absent columns yields the combined message."""
    mixed_df = pd.DataFrame(columns=["column1", "extra_column"])
    pattern = r"^Unexpected columns: \['extra_column'\], missing columns: \[.+\]$"
    with pytest.raises(ValueError, match=pattern) as excinfo:
        validate_column_names(mixed_df, correct_columns)

    # Check the missing names individually so the test does not depend on the
    # order in which they are listed.
    missing_part = str(excinfo.value).split("missing columns: ")[1]
    assert "'column2'" in missing_part
    assert "'column3'" in missing_part
    # "column1" is present in mixed_df and must not be reported as missing.
    assert "'column1'" not in missing_part

0 comments on commit c952f35

Please sign in to comment.