diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_.py
similarity index 100%
rename from examples/30_extended/custom_flow_tutorial.py
rename to examples/30_extended/custom_flow_.py
diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py
index 76eb2f219..5e73e7e9a 100644
--- a/examples/30_extended/flows_and_runs_tutorial.py
+++ b/examples/30_extended/flows_and_runs_tutorial.py
@@ -8,6 +8,7 @@
 # License: BSD 3-Clause
 
 import openml
+import numpy as np
 from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
 
 ############################################################################
@@ -83,12 +84,10 @@
 #
 # When you need to handle 'dirty' data, build pipelines to model them automatically.
 task = openml.tasks.get_task(1)
-features = task.get_dataset().features
-nominal_feature_indices = [
-    i
-    for i in range(len(features))
-    if features[i].name != task.target_name and features[i].data_type == "nominal"
-]
+
+# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
+from openml.extensions.sklearn import cat, cont
+
 pipe = pipeline.Pipeline(
     steps=[
         (
@@ -96,20 +95,21 @@
             compose.ColumnTransformer(
                 [
                     (
-                        "Nominal",
+                        "categorical",
                         pipeline.Pipeline(
                             [
                                 ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
                                 (
                                     "Encoder",
                                     preprocessing.OneHotEncoder(
-                                        sparse=False, handle_unknown="ignore",
+                                        sparse=False, handle_unknown="ignore"
                                     ),
                                 ),
                             ]
                         ),
-                        nominal_feature_indices,
+                        cat,  # returns the categorical feature indices
                     ),
+                    ("continuous", "passthrough", cont),  # returns the numeric feature indices
                 ]
             ),
         ),
@@ -121,6 +121,56 @@
 myrun = run.publish()
 print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
 
+
+# The pipeline above relies on helper functions that internally work with a pandas DataFrame.
+# If pandas is not available, or NumPy-based data processing is required, the same pipeline
+# can be built with explicit column indices, as shown below.
+
+# Extracting the indices of the categorical and numeric columns
+features = task.get_dataset().features
+categorical_feature_indices = []
+numeric_feature_indices = []
+for i in range(len(features)):
+    if features[i].name == task.target_name:
+        continue
+    if features[i].data_type == "nominal":
+        categorical_feature_indices.append(i)
+    else:
+        numeric_feature_indices.append(i)
+
+pipe = pipeline.Pipeline(
+    steps=[
+        (
+            "Preprocessing",
+            compose.ColumnTransformer(
+                [
+                    (
+                        "categorical",
+                        pipeline.Pipeline(
+                            [
+                                ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
+                                (
+                                    "Encoder",
+                                    preprocessing.OneHotEncoder(
+                                        sparse=False, handle_unknown="ignore"
+                                    ),
+                                ),
+                            ]
+                        ),
+                        categorical_feature_indices,
+                    ),
+                    ("continuous", "passthrough", numeric_feature_indices),
+                ]
+            ),
+        ),
+        ("Classifier", ensemble.RandomForestClassifier(n_estimators=10)),
+    ]
+)
+
+run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
+myrun = run.publish()
+print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
+
 ###############################################################################
 # Running flows on tasks offline for later upload
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
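Note: the `cat` and `cont` helpers operate on a pandas DataFrame, which is why the NumPy variant above switches to explicit index lists and requests `dataset_format="array"`. A minimal sketch of the difference, using made-up toy columns:

    import numpy as np
    import pandas as pd
    from openml.extensions.sklearn import cat

    X_df = pd.DataFrame({"color": pd.Series(["red", "blue"], dtype="category"), "size": [1.0, 2.0]})
    print(cat(X_df).tolist())  # [True, False] -- a boolean mask over the columns

    cat(np.zeros((2, 2)))  # raises AttributeError: an ndarray has no 'dtypes'
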
diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py
index cea38e062..afc49a98b 100644
--- a/examples/30_extended/run_setup_tutorial.py
+++ b/examples/30_extended/run_setup_tutorial.py
@@ -34,6 +34,8 @@
 import numpy as np
 import openml
 
+from openml.extensions.sklearn import cat, cont
+
 from sklearn.pipeline import make_pipeline, Pipeline
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
@@ -57,15 +59,6 @@
 # easy as you want it to be
 
 
-# Helper functions to return required columns for ColumnTransformer
-def cont(X):
-    return X.dtypes != "category"
-
-
-def cat(X):
-    return X.dtypes == "category"
-
-
 cat_imp = make_pipeline(
     SimpleImputer(strategy="most_frequent"),
     OneHotEncoder(handle_unknown="ignore", sparse=False),
diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py
index c879e9fea..533f645b2 100644
--- a/examples/30_extended/task_manual_iteration_tutorial.py
+++ b/examples/30_extended/task_manual_iteration_tutorial.py
@@ -61,11 +61,11 @@
 
 ####################################################################################################
 # And then split the data based on this:
-X, y, _, _ = task.get_dataset().get_data(task.target_name)
-X_train = X.loc[train_indices]
-y_train = y[train_indices]
-X_test = X.loc[test_indices]
-y_test = y[test_indices]
+X, y = task.get_X_and_y(dataset_format="dataframe")
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
 
 print(
     "X_train.shape: {}, y_train.shape: {}, X_test.shape: {}, y_test.shape: {}".format(
@@ -78,6 +78,7 @@
 
 task_id = 3
 task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -93,10 +94,10 @@
             train_indices, test_indices = task.get_train_test_split_indices(
                 repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
             )
-            X_train = X.loc[train_indices]
-            y_train = y[train_indices]
-            X_test = X.loc[test_indices]
-            y_test = y[test_indices]
+            X_train = X.iloc[train_indices]
+            y_train = y.iloc[train_indices]
+            X_test = X.iloc[test_indices]
+            y_test = y.iloc[test_indices]
 
             print(
                 "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
@@ -116,6 +117,7 @@
 
 task_id = 1767
 task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -131,10 +133,10 @@
             train_indices, test_indices = task.get_train_test_split_indices(
                 repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
             )
-            X_train = X.loc[train_indices]
-            y_train = y[train_indices]
-            X_test = X.loc[test_indices]
-            y_test = y[test_indices]
+            X_train = X.iloc[train_indices]
+            y_train = y.iloc[train_indices]
+            X_test = X.iloc[test_indices]
+            y_test = y.iloc[test_indices]
 
             print(
                 "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
@@ -154,6 +156,7 @@
 
 task_id = 1702
 task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -169,10 +172,10 @@
             train_indices, test_indices = task.get_train_test_split_indices(
                 repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
            )
-            X_train = X.loc[train_indices]
-            y_train = y[train_indices]
-            X_test = X.loc[test_indices]
-            y_test = y[test_indices]
+            X_train = X.iloc[train_indices]
+            y_train = y.iloc[train_indices]
+            X_test = X.iloc[test_indices]
+            y_test = y.iloc[test_indices]
 
             print(
                 "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
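Note on the `.loc` -> `.iloc` switch above: the indices returned by `get_train_test_split_indices` are positional, while `.loc` performs label-based lookup. The two only coincide when the DataFrame index happens to be the default RangeIndex. A small illustration with assumed toy data (recent pandas raises; older versions silently returned NaN rows):

    import pandas as pd

    X = pd.DataFrame({"a": [10, 20, 30]}, index=[7, 8, 9])  # non-default index labels
    train_indices = [0, 2]  # positions, as produced by the task splits
    print(X.iloc[train_indices])  # rows at positions 0 and 2 -- correct
    # X.loc[train_indices] would look up labels 0 and 2 and fail with a KeyError
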
diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py
index 2003934db..135e5ccf6 100644
--- a/openml/extensions/sklearn/__init__.py
+++ b/openml/extensions/sklearn/__init__.py
@@ -7,3 +7,31 @@
 __all__ = ["SklearnExtension"]
 
 register_extension(SklearnExtension)
+
+
+def cont(X):
+    """Returns True for all non-categorical columns, False for the rest.
+
+    This is a helper function for OpenML datasets encoded as DataFrames, simplifying the
+    handling of mixed data types. To build sklearn models on mixed data types, a
+    ColumnTransformer is required to process each type of column separately.
+    This function gives transformations meant for continuous/numeric columns access to the
+    continuous/numeric columns of the dataset, given as a DataFrame.
+    """
+    if not hasattr(X, "dtypes"):
+        raise AttributeError("Not a pandas DataFrame with a 'dtypes' attribute!")
+    return X.dtypes != "category"
+
+
+def cat(X):
+    """Returns True for all categorical columns, False for the rest.
+
+    This is a helper function for OpenML datasets encoded as DataFrames, simplifying the
+    handling of mixed data types. To build sklearn models on mixed data types, a
+    ColumnTransformer is required to process each type of column separately.
+    This function gives transformations meant for categorical columns access to the
+    categorical columns of the dataset, given as a DataFrame.
+    """
+    if not hasattr(X, "dtypes"):
+        raise AttributeError("Not a pandas DataFrame with a 'dtypes' attribute!")
+    return X.dtypes == "category"
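For context, these selectors plug straight into a `ColumnTransformer`, which calls them on the input DataFrame to obtain a boolean column mask. A self-contained sketch with invented toy data:

    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from openml.extensions.sklearn import cat, cont

    X = pd.DataFrame(
        {
            "color": pd.Series(["red", "blue", "red"], dtype="category"),
            "size": [1.0, 2.0, 3.0],
        }
    )
    ct = ColumnTransformer(
        [
            ("categorical", OneHotEncoder(handle_unknown="ignore"), cat),
            ("continuous", StandardScaler(), cont),
        ]
    )
    print(ct.fit_transform(X).shape)  # (3, 3): two one-hot columns plus the scaled column
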
+ """ + if not hasattr(X, "dtypes"): + raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!") + return X.dtypes == "category" diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 194e4b598..89b811d10 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -10,6 +10,7 @@ import sklearn.metrics import xmltodict +import numpy as np import pandas as pd import openml @@ -508,7 +509,9 @@ def _calculate_local_measure(sklearn_fn, openml_name): for i, tst_idx in enumerate(test_indices): if task.class_labels is not None: prediction = ( - task.class_labels[pred_y[i]] if isinstance(pred_y[i], int) else pred_y[i] + task.class_labels[pred_y[i]] + if isinstance(pred_y[i], (int, np.integer)) + else pred_y[i] ) if isinstance(test_y, pd.Series): test_prediction = ( @@ -519,7 +522,7 @@ def _calculate_local_measure(sklearn_fn, openml_name): else: test_prediction = ( task.class_labels[test_y[i]] - if isinstance(test_y[i], int) + if isinstance(test_y[i], (int, np.integer)) else test_y[i] ) pred_prob = proba_y.iloc[i] if isinstance(proba_y, pd.DataFrame) else proba_y[i] diff --git a/openml/testing.py b/openml/testing.py index bbb8d5f88..31bd87b9a 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -318,12 +318,4 @@ class CustomImputer(SimpleImputer): pass -def cont(X): - return X.dtypes != "category" - - -def cat(X): - return X.dtypes == "category" - - -__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont", "check_task_existence"] +__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "check_task_existence"] diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 8d7857bc2..8ca6f9d45 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -40,7 +40,8 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont +from openml.testing import TestBase, SimpleImputer, CustomImputer +from openml.extensions.sklearn import cat, cont this_directory = os.path.dirname(os.path.abspath(__file__)) @@ -2187,16 +2188,6 @@ def test_failed_serialization_of_custom_class(self): # for lower versions from sklearn.preprocessing import Imputer as SimpleImputer - class CustomImputer(SimpleImputer): - pass - - def cont(X): - return X.dtypes != "category" - - def cat(X): - return X.dtypes == "category" - - import sklearn.metrics import sklearn.tree from sklearn.pipeline import Pipeline, make_pipeline from sklearn.compose import ColumnTransformer @@ -2219,3 +2210,38 @@ def cat(X): raise AttributeError(e) else: raise Exception(e) + + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) + def test_setupid_with_column_transformer(self): + """Test to check if inclusion of ColumnTransformer in a pipleline is treated as a new + flow each time. 
+ """ + import sklearn.compose + from sklearn.svm import SVC + + def column_transformer_pipe(task_id): + task = openml.tasks.get_task(task_id) + # make columntransformer + preprocessor = sklearn.compose.ColumnTransformer( + transformers=[ + ("num", StandardScaler(), cont), + ("cat", OneHotEncoder(handle_unknown="ignore"), cat), + ] + ) + # make pipeline + clf = SVC(gamma="scale", random_state=1) + pipe = make_pipeline(preprocessor, clf) + # run task + run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False) + run.publish() + new_run = openml.runs.get_run(run.run_id) + return new_run + + run1 = column_transformer_pipe(11) # only categorical + TestBase._mark_entity_for_removal("run", run1.run_id) + run2 = column_transformer_pipe(23) # only numeric + TestBase._mark_entity_for_removal("run", run2.run_id) + self.assertEqual(run1.setup_id, run2.setup_id) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 500c4063d..e7c0c06fc 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -20,7 +20,8 @@ import pandas as pd import openml.extensions.sklearn -from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont +from openml.testing import TestBase, SimpleImputer, CustomImputer +from openml.extensions.sklearn import cat, cont from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction from openml.runs.trace import OpenMLRunTrace from openml.tasks import TaskType diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index fdb2747ec..e2a228aee 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,6 +1,7 @@ # License: BSD 3-Clause -from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont +from openml.testing import TestBase, SimpleImputer, CustomImputer +from openml.extensions.sklearn import cat, cont import sklearn import unittest