Commit ab793a6

Neeratyoy, mfeurer, and PGijsbers authored

Adding helper functions to support ColumnTransformer (#982)
* Adding importable helper functions
* Changing import of cat, cont
* Better docstrings
* Adding unit test to check ColumnTransformer
* Refinements from @mfeurer
* Editing example to support both NumPy and Pandas
* Unit test fix to mark for deletion
* Making some unit tests work
* Waiting for dataset to be processed
* Minor test collection fix
* Template to handle missing tasks
* Accounting for more missing tasks
* Fixing some more unit tests
* Simplifying check_task_existence
* black changes
* Minor formatting
* Handling task exists check
* Testing edited check task func
* Flake fix
* More retries on connection error
* Adding max_retries to config default
* Update database retry unit test
* Print to debug hash exception
* Fixing checksum unit test
* Retry on _download_text_file
* Update datasets_tutorial.py
* Update custom_flow_tutorial.py
* Update test_study_functions.py
* Update test_dataset_functions.py
* more retries, but also more time between retries
* allow for even more retries on get calls
* Catching failed get task
* undo stupid change
* fix one more test
* Refactoring md5 hash check inside _send_request
* Fixing a fairly common unit test fail
* Reverting loose check on unit test
* Fixing integer type check to allow np.integer
* Trying to loosen check on unit test as fix
* Examples support for pandas=1.2.1
* pandas indexing as iloc
* fix example: actually load the different tasks
* Renaming custom flow to disable tutorial (#1019)

Co-authored-by: Matthias Feurer <[email protected]>
Co-authored-by: PGijsbers <[email protected]>
1 parent e074c14 commit ab793a6

File tree

10 files changed: +156 −59 lines

File renamed without changes.

examples/30_extended/flows_and_runs_tutorial.py

Lines changed: 59 additions & 9 deletions

@@ -8,6 +8,7 @@
 # License: BSD 3-Clause
 
 import openml
+import numpy as np
 from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
 
 ############################################################################
@@ -83,33 +84,32 @@
 #
 # When you need to handle 'dirty' data, build pipelines to model them automatically.
 task = openml.tasks.get_task(1)
-features = task.get_dataset().features
-nominal_feature_indices = [
-    i
-    for i in range(len(features))
-    if features[i].name != task.target_name and features[i].data_type == "nominal"
-]
+
+# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
+from openml.extensions.sklearn import cat, cont
+
 pipe = pipeline.Pipeline(
     steps=[
         (
             "Preprocessing",
             compose.ColumnTransformer(
                 [
                     (
-                        "Nominal",
+                        "categorical",
                         pipeline.Pipeline(
                             [
                                 ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
                                 (
                                     "Encoder",
                                     preprocessing.OneHotEncoder(
-                                        sparse=False, handle_unknown="ignore",
+                                        sparse=False, handle_unknown="ignore"
                                     ),
                                 ),
                             ]
                         ),
-                        nominal_feature_indices,
+                        cat,  # returns the categorical feature indices
                     ),
+                    ("continuous", "passthrough", cont),  # returns the numeric feature indices
                 ]
             ),
         ),
@@ -121,6 +121,56 @@
 myrun = run.publish()
 print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
 
+
+# The pipeline above relies on the helper functions, which internally work with a pandas
+# DataFrame. If pandas is not available, or NumPy-based data processing is required, the
+# same pipeline can be built with NumPy as shown below.
+
+# Extracting the indices of the categorical and numeric columns
+features = task.get_dataset().features
+categorical_feature_indices = []
+numeric_feature_indices = []
+for i in range(len(features)):
+    if features[i].name == task.target_name:
+        continue
+    if features[i].data_type == "nominal":
+        categorical_feature_indices.append(i)
+    else:
+        numeric_feature_indices.append(i)
+
+pipe = pipeline.Pipeline(
+    steps=[
+        (
+            "Preprocessing",
+            compose.ColumnTransformer(
+                [
+                    (
+                        "categorical",
+                        pipeline.Pipeline(
+                            [
+                                ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
+                                (
+                                    "Encoder",
+                                    preprocessing.OneHotEncoder(
+                                        sparse=False, handle_unknown="ignore"
+                                    ),
+                                ),
+                            ]
+                        ),
+                        categorical_feature_indices,
+                    ),
+                    ("continuous", "passthrough", numeric_feature_indices),
+                ]
+            ),
+        ),
+        ("Classifier", ensemble.RandomForestClassifier(n_estimators=10)),
+    ]
+)
+
+run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
+myrun = run.publish()
+print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
+
 ###############################################################################
 # Running flows on tasks offline for later upload
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
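Side note (not part of the commit): cat and cont are plain callables, and sklearn's ColumnTransformer accepts a callable as a column selector; during fitting it calls the selector with X and uses the returned boolean mask. A minimal sketch of what the helpers return, on a toy DataFrame chosen here purely for illustration:

import pandas as pd
from openml.extensions.sklearn import cat, cont

X = pd.DataFrame(
    {
        "color": pd.Series(["red", "blue"], dtype="category"),
        "size": [1.0, 2.5],
    }
)
print(cat(X).tolist())   # [True, False] -> only 'color' is categorical
print(cont(X).tolist())  # [False, True] -> only 'size' is continuous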

examples/30_extended/run_setup_tutorial.py

Lines changed: 2 additions & 9 deletions

@@ -34,6 +34,8 @@
 
 import numpy as np
 import openml
+from openml.extensions.sklearn import cat, cont
+
 from sklearn.pipeline import make_pipeline, Pipeline
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
@@ -57,15 +59,6 @@
 # easy as you want it to be
 
 
-# Helper functions to return required columns for ColumnTransformer
-def cont(X):
-    return X.dtypes != "category"
-
-
-def cat(X):
-    return X.dtypes == "category"
-
-
 cat_imp = make_pipeline(
     SimpleImputer(strategy="most_frequent"),
     OneHotEncoder(handle_unknown="ignore", sparse=False),

examples/30_extended/task_manual_iteration_tutorial.py

Lines changed: 20 additions & 17 deletions

@@ -61,11 +61,11 @@
 ####################################################################################################
 # And then split the data based on this:
 
-X, y, _, _ = task.get_dataset().get_data(task.target_name)
-X_train = X.loc[train_indices]
-y_train = y[train_indices]
-X_test = X.loc[test_indices]
-y_test = y[test_indices]
+X, y = task.get_X_and_y(dataset_format="dataframe")
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
 
 print(
     "X_train.shape: {}, y_train.shape: {}, X_test.shape: {}, y_test.shape: {}".format(
@@ -78,6 +78,7 @@
 
 task_id = 3
 task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -93,10 +94,10 @@
 train_indices, test_indices = task.get_train_test_split_indices(
     repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
 )
-X_train = X.loc[train_indices]
-y_train = y[train_indices]
-X_test = X.loc[test_indices]
-y_test = y[test_indices]
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
 
 print(
     "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
@@ -116,6 +117,7 @@
 
 task_id = 1767
 task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -131,10 +133,10 @@
 train_indices, test_indices = task.get_train_test_split_indices(
     repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
 )
-X_train = X.loc[train_indices]
-y_train = y[train_indices]
-X_test = X.loc[test_indices]
-y_test = y[test_indices]
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
 
 print(
     "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
@@ -154,6 +156,7 @@
 
 task_id = 1702
 task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -169,10 +172,10 @@
 train_indices, test_indices = task.get_train_test_split_indices(
     repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
 )
-X_train = X.loc[train_indices]
-y_train = y[train_indices]
-X_test = X.loc[test_indices]
-y_test = y[test_indices]
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
 
 print(
     "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "

openml/extensions/sklearn/__init__.py

Lines changed: 28 additions & 0 deletions

@@ -7,3 +7,31 @@
 __all__ = ["SklearnExtension"]
 
 register_extension(SklearnExtension)
+
+
+def cont(X):
+    """Returns True for all non-categorical columns, False for the rest.
+
+    This is a helper function for OpenML datasets encoded as DataFrames, simplifying the handling
+    of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is
+    required to process each type of column separately.
+    This function allows transformations meant for continuous/numeric columns to access the
+    continuous/numeric columns, given the dataset as a DataFrame.
+    """
+    if not hasattr(X, "dtypes"):
+        raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!")
+    return X.dtypes != "category"
+
+
+def cat(X):
+    """Returns True for all categorical columns, False for the rest.
+
+    This is a helper function for OpenML datasets encoded as DataFrames, simplifying the handling
+    of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is
+    required to process each type of column separately.
+    This function allows transformations meant for categorical columns to access the
+    categorical columns, given the dataset as a DataFrame.
+    """
+    if not hasattr(X, "dtypes"):
+        raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!")
+    return X.dtypes == "category"
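The hasattr guard makes the helpers fail fast with a readable message when they receive anything other than a DataFrame. A quick illustration (not from the commit) with a NumPy array, which has .dtype but not .dtypes:

import numpy as np
from openml.extensions.sklearn import cat

try:
    cat(np.zeros((2, 2)))  # arrays carry no per-column dtypes
except AttributeError as e:
    print(e)  # Not a Pandas DataFrame with 'dtypes' as attribute!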

openml/runs/functions.py

Lines changed: 5 additions & 2 deletions

@@ -10,6 +10,7 @@
 
 import sklearn.metrics
 import xmltodict
+import numpy as np
 import pandas as pd
 
 import openml
@@ -508,7 +509,9 @@ def _calculate_local_measure(sklearn_fn, openml_name):
     for i, tst_idx in enumerate(test_indices):
         if task.class_labels is not None:
             prediction = (
-                task.class_labels[pred_y[i]] if isinstance(pred_y[i], int) else pred_y[i]
+                task.class_labels[pred_y[i]]
+                if isinstance(pred_y[i], (int, np.integer))
+                else pred_y[i]
             )
             if isinstance(test_y, pd.Series):
                 test_prediction = (
@@ -519,7 +522,7 @@ def _calculate_local_measure(sklearn_fn, openml_name):
             else:
                 test_prediction = (
                     task.class_labels[test_y[i]]
-                    if isinstance(test_y[i], int)
+                    if isinstance(test_y[i], (int, np.integer))
                     else test_y[i]
                 )
             pred_prob = proba_y.iloc[i] if isinstance(proba_y, pd.DataFrame) else proba_y[i]
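The widened isinstance check matters because NumPy integer scalars, such as those produced by indexing a NumPy array of predictions, are not instances of Python's built-in int. A standalone sketch:

import numpy as np

label = np.int64(1)
print(isinstance(label, int))                # False: np.int64 is not a Python int
print(isinstance(label, (int, np.integer)))  # True: the widened check catches it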

openml/testing.py

Lines changed: 1 addition & 9 deletions

@@ -318,12 +318,4 @@ class CustomImputer(SimpleImputer):
     pass
 
 
-def cont(X):
-    return X.dtypes != "category"
-
-
-def cat(X):
-    return X.dtypes == "category"
-
-
-__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont", "check_task_existence"]
+__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "check_task_existence"]

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 37 additions & 11 deletions

@@ -40,7 +40,8 @@
 from openml.flows import OpenMLFlow
 from openml.flows.functions import assert_flows_equal
 from openml.runs.trace import OpenMLRunTrace
-from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
+from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.extensions.sklearn import cat, cont
 
 
 this_directory = os.path.dirname(os.path.abspath(__file__))
@@ -2187,16 +2188,6 @@ def test_failed_serialization_of_custom_class(self):
         # for lower versions
         from sklearn.preprocessing import Imputer as SimpleImputer
 
-        class CustomImputer(SimpleImputer):
-            pass
-
-        def cont(X):
-            return X.dtypes != "category"
-
-        def cat(X):
-            return X.dtypes == "category"
-
         import sklearn.metrics
         import sklearn.tree
         from sklearn.pipeline import Pipeline, make_pipeline
         from sklearn.compose import ColumnTransformer
@@ -2219,3 +2210,38 @@ def cat(X):
             raise AttributeError(e)
         else:
             raise Exception(e)
+
+    @unittest.skipIf(
+        LooseVersion(sklearn.__version__) < "0.20",
+        reason="columntransformer introduction in 0.20.0",
+    )
+    def test_setupid_with_column_transformer(self):
+        """Test to check if inclusion of ColumnTransformer in a pipeline is treated as a new
+        flow each time.
+        """
+        import sklearn.compose
+        from sklearn.svm import SVC
+
+        def column_transformer_pipe(task_id):
+            task = openml.tasks.get_task(task_id)
+            # make columntransformer
+            preprocessor = sklearn.compose.ColumnTransformer(
+                transformers=[
+                    ("num", StandardScaler(), cont),
+                    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
+                ]
+            )
+            # make pipeline
+            clf = SVC(gamma="scale", random_state=1)
+            pipe = make_pipeline(preprocessor, clf)
+            # run task
+            run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
+            run.publish()
+            new_run = openml.runs.get_run(run.run_id)
+            return new_run
+
+        run1 = column_transformer_pipe(11)  # only categorical
+        TestBase._mark_entity_for_removal("run", run1.run_id)
+        run2 = column_transformer_pipe(23)  # only numeric
+        TestBase._mark_entity_for_removal("run", run2.run_id)
+        self.assertEqual(run1.setup_id, run2.setup_id)
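A plausible reading of why the helpers had to become importable (hedged; the commit message only says "Adding importable helper functions"): module-level functions can be referenced by a stable qualified name when a pipeline is serialized, whereas locally defined functions cannot, as plain pickle already demonstrates:

import pickle
from openml.extensions.sklearn import cat

pickle.dumps(cat)  # works: resolvable as openml.extensions.sklearn.cat

def make_local():
    def cat_local(X):
        return X.dtypes == "category"
    return cat_local

try:
    pickle.dumps(make_local())
except (AttributeError, pickle.PicklingError) as e:
    print(e)  # local objects cannot be pickled by reference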

tests/test_runs/test_run_functions.py

Lines changed: 2 additions & 1 deletion

@@ -20,7 +20,8 @@
 import pandas as pd
 
 import openml.extensions.sklearn
-from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
+from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.extensions.sklearn import cat, cont
 from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction
 from openml.runs.trace import OpenMLRunTrace
 from openml.tasks import TaskType

tests/test_study/test_study_examples.py

Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,7 @@
 # License: BSD 3-Clause
 
-from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
+from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.extensions.sklearn import cat, cont
 
 import sklearn
 import unittest
