Skip to content

Commit

Permalink
Fix partial dependence bugs with DFS transformer (#3830)
Browse files Browse the repository at this point in the history
* Allow engineered features with DFS transformer to have partial dependence calculations

* Add release note

* PR comments

* Allow multi output features to be handled correctly by dfs transformer

* Pass pipeline target into fast mode component handlers

* Add tests for the second two bug fixes

* clean up

* PR comments

* PR comments
  • Loading branch information
tamargrey authored Nov 18, 2022
1 parent b9ca2b6 commit 459ba58
Show file tree
Hide file tree
Showing 7 changed files with 178 additions and 14 deletions.
3 changes: 3 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ Release Notes
* Updated demo dataset links to point to new endpoint :pr:`3826`
* Updated ``STLDecomposer`` to infer the time index frequency if it's not present :pr:`3829`
* Updated ``_drop_time_index`` to move the time index from X to both ``X.index`` and ``y.index`` :pr:`3829`
* Fixed bug where engineered features lost their origin attribute in partial dependence, causing it to fail :pr:`3830`
* Fixed bug where partial dependence's fast mode handling for the DFS Transformer wouldn't work with multi output features :pr:`3830`
* Allowed target to be present and ignored in partial dependence's DFS Transformer fast mode handling :pr:`3830`
* Changes
* Consolidated decomposition frequency validation logic to ``Decomposer`` class :pr:`3811`
* Removed Featuretools version upper bound and prevent Woodwork 0.20.0 from being installed :pr:`3813`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ def _get_cloned_feature_pipelines(
new_parameters = pipeline.parameters
for component in pipeline.component_graph.component_instances.values():
new_parameters = component._handle_partial_dependence_fast_mode(
X_train,
new_parameters,
X=X_train,
target=pipeline.input_target_name,
)

# Create a fit pipeline for each feature
Expand Down Expand Up @@ -77,9 +78,7 @@ def _transform_single_feature(
fit for it.
"""
changed_col_df = pd.DataFrame({variable: part_dep_column})
changed_col_df.ww.init(
logical_types={variable: X.ww.logical_types[variable]},
)
changed_col_df.ww.init(schema=X.ww.schema.get_subset_schema([variable]))

# Take the changed column and send it through transform by itself
X_t_single_col = cloned_pipeline.transform_all_but_final(changed_col_df)
Expand Down
2 changes: 1 addition & 1 deletion evalml/model_understanding/_partial_dependence_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,8 +318,8 @@ def _partial_dependence_calculation(
X_eval.ww[variable] = ww.init_series(
part_dep_column,
logical_type=X_eval.ww.logical_types[variable],
origin=X_eval.ww.columns[variable].origin,
)

pred = prediction_method(X_eval)
predictions.append(pred)
# average over samples
Expand Down
10 changes: 8 additions & 2 deletions evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,19 @@ def default_parameters(cls):
def _supported_by_list_API(cls):
return not cls.modifies_target

def _handle_partial_dependence_fast_mode(self, X, pipeline_parameters):
def _handle_partial_dependence_fast_mode(
self,
pipeline_parameters,
X=None,
target=None,
):
"""Determines whether or not a component can be used with partial dependence's fast mode.
Args:
X (pd.DataFrame): Holdout data being used for partial dependence calculations.
pipeline_parameters (dict): Pipeline parameters that will be used to create the pipelines
used in partial dependence fast mode.
X (pd.DataFrame, optional): Holdout data being used for partial dependence calculations.
target (str, optional): The target whose values we are trying to predict.
"""
if self._can_be_used_for_fast_partial_dependence:
return pipeline_parameters
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,12 @@ def fit_transform(self, X, y=None):
"""
return self.fit(X, y).transform(X, y)

def _handle_partial_dependence_fast_mode(self, X, pipeline_parameters):
def _handle_partial_dependence_fast_mode(
self,
pipeline_parameters,
X=None,
target=None,
):
"""Updates pipeline parameters to not drop any features based off of feature importance.
This is needed, because fast mode refits cloned pipelines on single columns,
Expand All @@ -81,9 +86,10 @@ def _handle_partial_dependence_fast_mode(self, X, pipeline_parameters):
pipeline to determine if that feature gets dropped or not.
Args:
X (pd.DataFrame): Holdout data being used for partial dependence calculations.
pipeline_parameters (dict): Pipeline parameters that will be used to create the pipelines
used in partial dependence fast mode.
X (pd.DataFrame, optional): Holdout data being used for partial dependence calculations.
target (str, optional): The target whose values we are trying to predict.
Return:
pipeline_parameters (dict): Pipeline parameters updated to allow the FeatureSelector component
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,26 +129,36 @@ def transform(self, X, y=None):
feature_matrix.ww.init(schema=partial_schema)
return feature_matrix

def _handle_partial_dependence_fast_mode(self, X, pipeline_parameters):
"""Determines whether or not a DFSTransformer component can be used with partial dependence's fast mode.
def _handle_partial_dependence_fast_mode(self, pipeline_parameters, X, target):
"""Determines whether or not a DFS Transformer component can be used with partial dependence's fast mode.
Note:
This component can be used with partial dependence fast mode only when
all of the features present in the ``features`` parameter are present
in the DataFrame.
Args:
X (pd.DataFrame): Holdout data being used for partial dependence calculations.
pipeline_parameters (dict): Pipeline parameters that will be used to create the pipelines
used in partial dependence fast mode.
X (pd.DataFrame): Holdout data being used for partial dependence calculations.
target (str): The target whose values we are trying to predict. This is used
to know which column to ignore if the target column is present in the list of features
in the DFS Transformer's parameters
"""
dfs_transformer = pipeline_parameters.get("DFS Transformer")
if dfs_transformer is not None:
dfs_features = dfs_transformer["features"]
# remove the target if it's there
dfs_feature_names = [
name
for feature in dfs_features
for name in feature.get_feature_names()
if name != target
]
X_cols = set(X.columns)

if dfs_features is None or any(
f.get_name() not in X_cols for f in dfs_features
name not in X_cols for name in dfs_feature_names
):
raise ValueError(
"Cannot use fast mode with DFS Transformer when features are unspecified or not all present in X.",
Expand Down
140 changes: 140 additions & 0 deletions evalml/tests/model_understanding_tests/test_partial_dependence.py
Original file line number Diff line number Diff line change
Expand Up @@ -2794,3 +2794,143 @@ def test_partial_dependence_fast_mode_errors_if_train(
fast_mode=True,
y_train=y,
)


@pytest.mark.parametrize("fast_mode", [True, False])
def test_partial_dependence_on_engineered_feature_with_dfs_transformer(
    fast_mode,
    X_y_binary,
):
    """Partial dependence on a DFS-engineered feature succeeds in both normal and fast mode."""
    X, y = X_y_binary
    X = pd.DataFrame(X)
    X.columns = X.columns.astype(str)

    # add_dataframe returns the entityset, so the calls can be chained
    entity_set = ft.EntitySet().add_dataframe(
        dataframe_name="X",
        dataframe=X,
        index="index",
        make_index=True,
    )
    feature_matrix, dfs_features = ft.dfs(
        entityset=entity_set,
        target_dataframe_name="X",
        trans_primitives=["absolute"],
    )

    pipeline = BinaryClassificationPipeline(
        [
            DFSTransformer(features=dfs_features),
            "Standard Scaler",
            "Random Forest Classifier",
        ],
    )

    # Engineered features have their origin set to either "base" or "engineered";
    # that attribute must survive for partial dependence to predict on the
    # updated data.
    engineered_feature = "ABSOLUTE(1)"
    assert feature_matrix.ww.columns[engineered_feature].origin == "engineered"

    pipeline.fit(feature_matrix, y)
    part_dep = partial_dependence(
        pipeline,
        feature_matrix,
        features=engineered_feature,
        grid_resolution=2,
        fast_mode=fast_mode,
        X_train=feature_matrix,
        y_train=y,
    )

    assert part_dep.feature_values.notnull().all()
    assert part_dep.partial_dependence.notnull().all()


@pytest.mark.parametrize("fast_mode", [True, False])
def test_partial_dependence_dfs_transformer_handling_with_multi_output_primitive(
    fast_mode,
    df_with_url_and_email,
):
    """Fast-mode DFS Transformer handling works when a primitive yields multiple output features."""
    X = df_with_url_and_email
    y = pd.Series(range(len(X)))
    X.ww.name = "X"
    X.ww.set_index("numeric")
    X.ww.set_types(logical_types={"categorical": "NaturalLanguage"})

    # add_dataframe returns the entityset, so the calls can be chained
    entity_set = ft.EntitySet().add_dataframe(
        dataframe_name="X",
        dataframe=X,
        index="index",
        make_index=True,
    )
    feature_matrix, dfs_features = ft.dfs(
        entityset=entity_set,
        target_dataframe_name="X",
        trans_primitives=["LSA"],
    )

    pipeline = RegressionPipeline(
        [
            DFSTransformer(features=dfs_features),
            "Standard Scaler",
            "Random Forest Regressor",
        ],
    )
    # Confirm that a multi-output feature is present
    assert any(feature.number_output_features > 1 for feature in dfs_features)

    pipeline.fit(feature_matrix, y)
    part_dep = partial_dependence(
        pipeline,
        feature_matrix,
        features=0,
        grid_resolution=2,
        fast_mode=fast_mode,
        X_train=feature_matrix,
        y_train=y,
    )

    assert part_dep.feature_values.notnull().all()
    assert part_dep.partial_dependence.notnull().all()


@pytest.mark.parametrize("fast_mode", [True, False])
def test_partial_dependence_dfs_transformer_target_in_features(fast_mode, X_y_binary):
    """A target column present in the DFS features list is ignored by fast-mode handling."""
    X, y = X_y_binary
    X = pd.DataFrame(X)
    X.columns = X.columns.astype(str)

    # Insert y into X so that it's part of the EntitySet, then have DFS
    # ignore it so it doesn't appear in the resulting feature matrix
    X["target"] = y

    # add_dataframe returns the entityset, so the calls can be chained
    entity_set = ft.EntitySet().add_dataframe(
        dataframe_name="X",
        dataframe=X,
        index="index",
        make_index=True,
    )
    seed_features = [ft.Feature(entity_set["X"].ww["target"])]
    feature_matrix, dfs_features = ft.dfs(
        entityset=entity_set,
        target_dataframe_name="X",
        trans_primitives=["absolute"],
        ignore_columns={"X": ["target"]},
        seed_features=seed_features,
    )
    # The target must be present in the features list for this test to be meaningful
    assert any(feature.get_name() == "target" for feature in dfs_features)

    pipeline = BinaryClassificationPipeline(
        [
            DFSTransformer(features=dfs_features),
            "Standard Scaler",
            "Random Forest Classifier",
        ],
    )

    pipeline.fit(feature_matrix, y)
    part_dep = partial_dependence(
        pipeline,
        feature_matrix,
        features=0,
        grid_resolution=2,
        fast_mode=fast_mode,
        X_train=feature_matrix,
        y_train=y,
    )

    assert part_dep.feature_values.notnull().all()
    assert part_dep.partial_dependence.notnull().all()

0 comments on commit 459ba58

Please sign in to comment.