Skip to content

Commit

Permalink
Fix partial dependence bugs with DFS transformer (#3830)
Browse files Browse the repository at this point in the history
* Allow engineered features with DFS transformer to have partial dependence calculations

* Add release note

* PR comments

* Allow multi output features to be handled correctly by dfs transformer

* Pass pipeline target into fast mode component handlers

* Add tests for the second two bug fixes

* clean up

* PR comments

* PR comments
  • Loading branch information
tamargrey authored Nov 18, 2022
1 parent b9ca2b6 commit 459ba58
Show file tree
Hide file tree
Showing 7 changed files with 178 additions and 14 deletions.
3 changes: 3 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ Release Notes
* Updated demo dataset links to point to new endpoint :pr:`3826`
* Updated ``STLDecomposer`` to infer the time index frequency if it's not present :pr:`3829`
* Updated ``_drop_time_index`` to move the time index from X to both ``X.index`` and ``y.index`` :pr:`3829`
* Fixed bug where engineered features lost their origin attribute in partial dependence, causing it to fail :pr:`3830`
* Fixed bug where partial dependence's fast mode handling for the DFS Transformer wouldn't work with multi output features :pr:`3830`
* Allowed target to be present and ignored in partial dependence's DFS Transformer fast mode handling :pr:`3830`
* Changes
* Consolidated decomposition frequency validation logic to ``Decomposer`` class :pr:`3811`
* Removed Featuretools version upper bound and prevent Woodwork 0.20.0 from being installed :pr:`3813`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ def _get_cloned_feature_pipelines(
new_parameters = pipeline.parameters
for component in pipeline.component_graph.component_instances.values():
new_parameters = component._handle_partial_dependence_fast_mode(
X_train,
new_parameters,
X=X_train,
target=pipeline.input_target_name,
)

# Create a fit pipeline for each feature
Expand Down Expand Up @@ -77,9 +78,7 @@ def _transform_single_feature(
fit for it.
"""
changed_col_df = pd.DataFrame({variable: part_dep_column})
changed_col_df.ww.init(
logical_types={variable: X.ww.logical_types[variable]},
)
changed_col_df.ww.init(schema=X.ww.schema.get_subset_schema([variable]))

# Take the changed column and send it through transform by itself
X_t_single_col = cloned_pipeline.transform_all_but_final(changed_col_df)
Expand Down
2 changes: 1 addition & 1 deletion evalml/model_understanding/_partial_dependence_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,8 +318,8 @@ def _partial_dependence_calculation(
X_eval.ww[variable] = ww.init_series(
part_dep_column,
logical_type=X_eval.ww.logical_types[variable],
origin=X_eval.ww.columns[variable].origin,
)

pred = prediction_method(X_eval)
predictions.append(pred)
# average over samples
Expand Down
10 changes: 8 additions & 2 deletions evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,19 @@ def default_parameters(cls):
def _supported_by_list_API(cls):
return not cls.modifies_target

def _handle_partial_dependence_fast_mode(self, X, pipeline_parameters):
def _handle_partial_dependence_fast_mode(
self,
pipeline_parameters,
X=None,
target=None,
):
"""Determines whether or not a component can be used with partial dependence's fast mode.
Args:
X (pd.DataFrame): Holdout data being used for partial dependence calculations.
pipeline_parameters (dict): Pipeline parameters that will be used to create the pipelines
used in partial dependence fast mode.
X (pd.DataFrame, optional): Holdout data being used for partial dependence calculations.
target (str, optional): The target whose values we are trying to predict.
"""
if self._can_be_used_for_fast_partial_dependence:
return pipeline_parameters
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,12 @@ def fit_transform(self, X, y=None):
"""
return self.fit(X, y).transform(X, y)

def _handle_partial_dependence_fast_mode(self, X, pipeline_parameters):
def _handle_partial_dependence_fast_mode(
self,
pipeline_parameters,
X=None,
target=None,
):
"""Updates pipeline parameters to not drop any features based off of feature importance.
This is needed, because fast mode refits cloned pipelines on single columns,
Expand All @@ -81,9 +86,10 @@ def _handle_partial_dependence_fast_mode(self, X, pipeline_parameters):
pipeline to determine if that feature gets dropped or not.
Args:
X (pd.DataFrame): Holdout data being used for partial dependence calculations.
pipeline_parameters (dict): Pipeline parameters that will be used to create the pipelines
used in partial dependence fast mode.
X (pd.DataFrame, optional): Holdout data being used for partial dependence calculations.
target (str, optional): The target whose values we are trying to predict.
Return:
pipeline_parameters (dict): Pipeline parameters updated to allow the FeatureSelector component
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,26 +129,36 @@ def transform(self, X, y=None):
feature_matrix.ww.init(schema=partial_schema)
return feature_matrix

def _handle_partial_dependence_fast_mode(self, X, pipeline_parameters):
"""Determines whether or not a DFSTransformer component can be used with partial dependence's fast mode.
def _handle_partial_dependence_fast_mode(self, pipeline_parameters, X, target):
"""Determines whether or not a DFS Transformer component can be used with partial dependence's fast mode.
Note:
This component can be used with partial dependence fast mode only when
all of the features present in the ``features`` parameter are present
in the DataFrame.
Args:
X (pd.DataFrame): Holdout data being used for partial dependence calculations.
pipeline_parameters (dict): Pipeline parameters that will be used to create the pipelines
used in partial dependence fast mode.
X (pd.DataFrame): Holdout data being used for partial dependence calculations.
target (str): The target whose values we are trying to predict. This is used
to know which column to ignore if the target column is present in the list of features
in the DFS Transformer's parameters
"""
dfs_transformer = pipeline_parameters.get("DFS Transformer")
if dfs_transformer is not None:
dfs_features = dfs_transformer["features"]
# remove the target if it's there
dfs_feature_names = [
name
for feature in dfs_features
for name in feature.get_feature_names()
if name != target
]
X_cols = set(X.columns)

if dfs_features is None or any(
f.get_name() not in X_cols for f in dfs_features
name not in X_cols for name in dfs_feature_names
):
raise ValueError(
"Cannot use fast mode with DFS Transformer when features are unspecified or not all present in X.",
Expand Down
140 changes: 140 additions & 0 deletions evalml/tests/model_understanding_tests/test_partial_dependence.py
Original file line number Diff line number Diff line change
Expand Up @@ -2794,3 +2794,143 @@ def test_partial_dependence_fast_mode_errors_if_train(
fast_mode=True,
y_train=y,
)


@pytest.mark.parametrize("fast_mode", [True, False])
def test_partial_dependence_on_engineered_feature_with_dfs_transformer(
    fast_mode,
    X_y_binary,
):
    """Partial dependence on a DFS-engineered feature succeeds in both normal and fast mode."""
    X, y = X_y_binary
    X = pd.DataFrame(X)
    X.columns = X.columns.astype(str)

    # add_dataframe returns the entityset, so the calls can be chained
    entity_set = ft.EntitySet().add_dataframe(
        dataframe_name="X",
        dataframe=X,
        index="index",
        make_index=True,
    )
    feature_matrix, dfs_features = ft.dfs(
        entityset=entity_set,
        target_dataframe_name="X",
        trans_primitives=["absolute"],
    )

    pipeline = BinaryClassificationPipeline(
        [
            DFSTransformer(features=dfs_features),
            "Standard Scaler",
            "Random Forest Classifier",
        ],
    )

    # Engineered features have their origin set to either "base" or "engineered";
    # that attribute must survive for partial dependence to predict on the
    # updated data.
    engineered_feature = "ABSOLUTE(1)"
    assert feature_matrix.ww.columns[engineered_feature].origin == "engineered"

    pipeline.fit(feature_matrix, y)
    part_dep = partial_dependence(
        pipeline,
        feature_matrix,
        features=engineered_feature,
        grid_resolution=2,
        fast_mode=fast_mode,
        X_train=feature_matrix,
        y_train=y,
    )

    assert part_dep.feature_values.notnull().all()
    assert part_dep.partial_dependence.notnull().all()


@pytest.mark.parametrize("fast_mode", [True, False])
def test_partial_dependence_dfs_transformer_handling_with_multi_output_primitive(
    fast_mode,
    df_with_url_and_email,
):
    """Fast-mode DFS Transformer handling works when a primitive yields multiple output features."""
    X = df_with_url_and_email
    y = pd.Series(range(len(X)))
    X.ww.name = "X"
    X.ww.set_index("numeric")
    X.ww.set_types(logical_types={"categorical": "NaturalLanguage"})

    # add_dataframe returns the entityset, so the calls can be chained
    entity_set = ft.EntitySet().add_dataframe(
        dataframe_name="X",
        dataframe=X,
        index="index",
        make_index=True,
    )
    feature_matrix, dfs_features = ft.dfs(
        entityset=entity_set,
        target_dataframe_name="X",
        trans_primitives=["LSA"],
    )

    pipeline = RegressionPipeline(
        [
            DFSTransformer(features=dfs_features),
            "Standard Scaler",
            "Random Forest Regressor",
        ],
    )
    # Confirm that a multi-output feature is present
    assert any(feature.number_output_features > 1 for feature in dfs_features)

    pipeline.fit(feature_matrix, y)
    part_dep = partial_dependence(
        pipeline,
        feature_matrix,
        features=0,
        grid_resolution=2,
        fast_mode=fast_mode,
        X_train=feature_matrix,
        y_train=y,
    )

    assert part_dep.feature_values.notnull().all()
    assert part_dep.partial_dependence.notnull().all()


@pytest.mark.parametrize("fast_mode", [True, False])
def test_partial_dependence_dfs_transformer_target_in_features(fast_mode, X_y_binary):
    """A target column present in the DFS features list is ignored by fast-mode handling."""
    X, y = X_y_binary
    X = pd.DataFrame(X)
    X.columns = X.columns.astype(str)

    # Insert y into X so that it's part of the EntitySet, then have DFS
    # ignore it so it doesn't appear in the resulting feature matrix
    X["target"] = y

    # add_dataframe returns the entityset, so the calls can be chained
    entity_set = ft.EntitySet().add_dataframe(
        dataframe_name="X",
        dataframe=X,
        index="index",
        make_index=True,
    )
    seed_features = [ft.Feature(entity_set["X"].ww["target"])]
    feature_matrix, dfs_features = ft.dfs(
        entityset=entity_set,
        target_dataframe_name="X",
        trans_primitives=["absolute"],
        ignore_columns={"X": ["target"]},
        seed_features=seed_features,
    )
    # The target must be present in the features list for this test to be meaningful
    assert any(feature.get_name() == "target" for feature in dfs_features)

    pipeline = BinaryClassificationPipeline(
        [
            DFSTransformer(features=dfs_features),
            "Standard Scaler",
            "Random Forest Classifier",
        ],
    )

    pipeline.fit(feature_matrix, y)
    part_dep = partial_dependence(
        pipeline,
        feature_matrix,
        features=0,
        grid_resolution=2,
        fast_mode=fast_mode,
        X_train=feature_matrix,
        y_train=y,
    )

    assert part_dep.feature_values.notnull().all()
    assert part_dep.partial_dependence.notnull().all()

0 comments on commit 459ba58

Please sign in to comment.