Commit ab793a6

Neeratyoy, mfeurer, and PGijsbers authored

Adding helper functions to support ColumnTransformer (#982)
* Adding importable helper functions
* Changing import of cat, cont
* Better docstrings
* Adding unit test to check ColumnTransformer
* Refinements from @mfeurer
* Editing example to support both NumPy and Pandas
* Unit test fix to mark for deletion
* Making some unit tests work
* Waiting for dataset to be processed
* Minor test collection fix
* Template to handle missing tasks
* Accounting for more missing tasks
* Fixing some more unit tests
* Simplifying check_task_existence
* black changes
* Minor formatting
* Handling task exists check
* Testing edited check task func
* Flake fix
* More retries on connection error
* Adding max_retries to config default
* Update database retry unit test
* Print to debug hash exception
* Fixing checksum unit test
* Retry on _download_text_file
* Update datasets_tutorial.py
* Update custom_flow_tutorial.py
* Update test_study_functions.py
* Update test_dataset_functions.py
* more retries, but also more time between retries
* allow for even more retries on get calls
* Catching failed get task
* undo stupid change
* fix one more test
* Refactoring md5 hash check inside _send_request
* Fixing a fairly common unit test fail
* Reverting loose check on unit test
* Fixing integer type check to allow np.integer
* Trying to loosen check on unit test as fix
* Examples support for pandas=1.2.1
* pandas indexing as iloc
* fix example: actually load the different tasks
* Renaming custom flow to disable tutorial (#1019)

Co-authored-by: Matthias Feurer <[email protected]>
Co-authored-by: PGijsbers <[email protected]>
1 parent e074c14 commit ab793a6

File tree

10 files changed: +156 −59 lines

File renamed without changes.

examples/30_extended/flows_and_runs_tutorial.py

Lines changed: 59 additions & 9 deletions

@@ -8,6 +8,7 @@
 # License: BSD 3-Clause
 
 import openml
+import numpy as np
 from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
 
 ############################################################################
@@ -83,33 +84,32 @@
 #
 # When you need to handle 'dirty' data, build pipelines to model them automatically.
 task = openml.tasks.get_task(1)
-features = task.get_dataset().features
-nominal_feature_indices = [
-    i
-    for i in range(len(features))
-    if features[i].name != task.target_name and features[i].data_type == "nominal"
-]
+
+# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
+from openml.extensions.sklearn import cat, cont
+
 pipe = pipeline.Pipeline(
     steps=[
         (
             "Preprocessing",
             compose.ColumnTransformer(
                 [
                     (
-                        "Nominal",
+                        "categorical",
                         pipeline.Pipeline(
                             [
                                 ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
                                 (
                                     "Encoder",
                                     preprocessing.OneHotEncoder(
-                                        sparse=False, handle_unknown="ignore",
+                                        sparse=False, handle_unknown="ignore"
                                     ),
                                 ),
                             ]
                         ),
-                        nominal_feature_indices,
+                        cat,  # returns the categorical feature indices
                     ),
+                    ("continuous", "passthrough", cont),  # returns the numeric feature indices
                 ]
             ),
         ),
@@ -121,6 +121,56 @@
 myrun = run.publish()
 print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
 
+
+# The pipeline above relies on the helper functions, which internally work with a pandas
+# DataFrame. If pandas is not available, or NumPy-based data processing is required, the
+# same pipeline can be built with NumPy as shown below.
+
+# Extracting the indices of the categorical and numeric columns
+features = task.get_dataset().features
+categorical_feature_indices = []
+numeric_feature_indices = []
+for i in range(len(features)):
+    if features[i].name == task.target_name:
+        continue
+    if features[i].data_type == "nominal":
+        categorical_feature_indices.append(i)
+    else:
+        numeric_feature_indices.append(i)
+
+pipe = pipeline.Pipeline(
+    steps=[
+        (
+            "Preprocessing",
+            compose.ColumnTransformer(
+                [
+                    (
+                        "categorical",
+                        pipeline.Pipeline(
+                            [
+                                ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
+                                (
+                                    "Encoder",
+                                    preprocessing.OneHotEncoder(
+                                        sparse=False, handle_unknown="ignore"
+                                    ),
+                                ),
+                            ]
+                        ),
+                        categorical_feature_indices,
+                    ),
+                    ("continuous", "passthrough", numeric_feature_indices),
+                ]
+            ),
+        ),
+        ("Classifier", ensemble.RandomForestClassifier(n_estimators=10)),
+    ]
+)
+
+run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
+myrun = run.publish()
+print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
+
 ###############################################################################
 # Running flows on tasks offline for later upload
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
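Side note (not part of the commit): cat and cont are plain callables, and sklearn's ColumnTransformer accepts a callable as a column selector; during fitting it calls the selector with X and uses the returned boolean mask. A minimal sketch of what the helpers return, on a toy DataFrame chosen here purely for illustration:

import pandas as pd
from openml.extensions.sklearn import cat, cont

X = pd.DataFrame(
    {
        "color": pd.Series(["red", "blue"], dtype="category"),
        "size": [1.0, 2.5],
    }
)
print(cat(X).tolist())   # [True, False] -> only 'color' is categorical
print(cont(X).tolist())  # [False, True] -> only 'size' is continuous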

examples/30_extended/run_setup_tutorial.py

Lines changed: 2 additions & 9 deletions

@@ -34,6 +34,8 @@
 
 import numpy as np
 import openml
+from openml.extensions.sklearn import cat, cont
+
 from sklearn.pipeline import make_pipeline, Pipeline
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
@@ -57,15 +59,6 @@
 # easy as you want it to be
 
 
-# Helper functions to return required columns for ColumnTransformer
-def cont(X):
-    return X.dtypes != "category"
-
-
-def cat(X):
-    return X.dtypes == "category"
-
-
 cat_imp = make_pipeline(
     SimpleImputer(strategy="most_frequent"),
     OneHotEncoder(handle_unknown="ignore", sparse=False),

examples/30_extended/task_manual_iteration_tutorial.py

Lines changed: 20 additions & 17 deletions

@@ -61,11 +61,11 @@
 ####################################################################################################
 # And then split the data based on this:
 
-X, y, _, _ = task.get_dataset().get_data(task.target_name)
-X_train = X.loc[train_indices]
-y_train = y[train_indices]
-X_test = X.loc[test_indices]
-y_test = y[test_indices]
+X, y = task.get_X_and_y(dataset_format="dataframe")
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
 
 print(
     "X_train.shape: {}, y_train.shape: {}, X_test.shape: {}, y_test.shape: {}".format(
@@ -78,6 +78,7 @@
 
 task_id = 3
 task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -93,10 +94,10 @@
 train_indices, test_indices = task.get_train_test_split_indices(
     repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
 )
-X_train = X.loc[train_indices]
-y_train = y[train_indices]
-X_test = X.loc[test_indices]
-y_test = y[test_indices]
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
 
 print(
     "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
@@ -116,6 +117,7 @@
 
 task_id = 1767
 task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -131,10 +133,10 @@
 train_indices, test_indices = task.get_train_test_split_indices(
     repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
 )
-X_train = X.loc[train_indices]
-y_train = y[train_indices]
-X_test = X.loc[test_indices]
-y_test = y[test_indices]
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
 
 print(
     "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
@@ -154,6 +156,7 @@
 
 task_id = 1702
 task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -169,10 +172,10 @@
 train_indices, test_indices = task.get_train_test_split_indices(
     repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
 )
-X_train = X.loc[train_indices]
-y_train = y[train_indices]
-X_test = X.loc[test_indices]
-y_test = y[test_indices]
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
 
 print(
     "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "

openml/extensions/sklearn/__init__.py

Lines changed: 28 additions & 0 deletions

@@ -7,3 +7,31 @@
 __all__ = ["SklearnExtension"]
 
 register_extension(SklearnExtension)
+
+
+def cont(X):
+    """Returns True for all non-categorical columns, False for the rest.
+
+    This is a helper function for OpenML datasets encoded as DataFrames, simplifying the handling
+    of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is
+    required to process each type of column separately.
+    This function allows transformations meant for continuous/numeric columns to access the
+    continuous/numeric columns, given the dataset as a DataFrame.
+    """
+    if not hasattr(X, "dtypes"):
+        raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!")
+    return X.dtypes != "category"
+
+
+def cat(X):
+    """Returns True for all categorical columns, False for the rest.
+
+    This is a helper function for OpenML datasets encoded as DataFrames, simplifying the handling
+    of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is
+    required to process each type of column separately.
+    This function allows transformations meant for categorical columns to access the
+    categorical columns, given the dataset as a DataFrame.
+    """
+    if not hasattr(X, "dtypes"):
+        raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!")
+    return X.dtypes == "category"
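The hasattr guard makes the helpers fail fast with a readable message when they receive anything other than a DataFrame. A quick illustration (not from the commit) with a NumPy array, which has .dtype but not .dtypes:

import numpy as np
from openml.extensions.sklearn import cat

try:
    cat(np.zeros((2, 2)))  # arrays carry no per-column dtypes
except AttributeError as e:
    print(e)  # Not a Pandas DataFrame with 'dtypes' as attribute!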

openml/runs/functions.py

Lines changed: 5 additions & 2 deletions

@@ -10,6 +10,7 @@
 
 import sklearn.metrics
 import xmltodict
+import numpy as np
 import pandas as pd
 
 import openml
@@ -508,7 +509,9 @@ def _calculate_local_measure(sklearn_fn, openml_name):
     for i, tst_idx in enumerate(test_indices):
         if task.class_labels is not None:
             prediction = (
-                task.class_labels[pred_y[i]] if isinstance(pred_y[i], int) else pred_y[i]
+                task.class_labels[pred_y[i]]
+                if isinstance(pred_y[i], (int, np.integer))
+                else pred_y[i]
             )
             if isinstance(test_y, pd.Series):
                 test_prediction = (
@@ -519,7 +522,7 @@ def _calculate_local_measure(sklearn_fn, openml_name):
             else:
                 test_prediction = (
                     task.class_labels[test_y[i]]
-                    if isinstance(test_y[i], int)
+                    if isinstance(test_y[i], (int, np.integer))
                     else test_y[i]
                 )
             pred_prob = proba_y.iloc[i] if isinstance(proba_y, pd.DataFrame) else proba_y[i]
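The widened isinstance check matters because NumPy integer scalars, such as those produced by indexing a NumPy array of predictions, are not instances of Python's built-in int. A standalone sketch:

import numpy as np

label = np.int64(1)
print(isinstance(label, int))                # False: np.int64 is not a Python int
print(isinstance(label, (int, np.integer)))  # True: the widened check catches it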

openml/testing.py

Lines changed: 1 addition & 9 deletions

@@ -318,12 +318,4 @@ class CustomImputer(SimpleImputer):
     pass
 
 
-def cont(X):
-    return X.dtypes != "category"
-
-
-def cat(X):
-    return X.dtypes == "category"
-
-
-__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont", "check_task_existence"]
+__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "check_task_existence"]

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 37 additions & 11 deletions

@@ -40,7 +40,8 @@
 from openml.flows import OpenMLFlow
 from openml.flows.functions import assert_flows_equal
 from openml.runs.trace import OpenMLRunTrace
-from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
+from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.extensions.sklearn import cat, cont
 
 
 this_directory = os.path.dirname(os.path.abspath(__file__))
@@ -2187,16 +2188,6 @@ def test_failed_serialization_of_custom_class(self):
         # for lower versions
         from sklearn.preprocessing import Imputer as SimpleImputer
 
-        class CustomImputer(SimpleImputer):
-            pass
-
-        def cont(X):
-            return X.dtypes != "category"
-
-        def cat(X):
-            return X.dtypes == "category"
-
         import sklearn.metrics
         import sklearn.tree
         from sklearn.pipeline import Pipeline, make_pipeline
         from sklearn.compose import ColumnTransformer
@@ -2219,3 +2210,38 @@ def cat(X):
             raise AttributeError(e)
         else:
             raise Exception(e)
+
+    @unittest.skipIf(
+        LooseVersion(sklearn.__version__) < "0.20",
+        reason="columntransformer introduction in 0.20.0",
+    )
+    def test_setupid_with_column_transformer(self):
+        """Test to check if inclusion of ColumnTransformer in a pipeline is treated as a new
+        flow each time.
+        """
+        import sklearn.compose
+        from sklearn.svm import SVC
+
+        def column_transformer_pipe(task_id):
+            task = openml.tasks.get_task(task_id)
+            # make columntransformer
+            preprocessor = sklearn.compose.ColumnTransformer(
+                transformers=[
+                    ("num", StandardScaler(), cont),
+                    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
+                ]
+            )
+            # make pipeline
+            clf = SVC(gamma="scale", random_state=1)
+            pipe = make_pipeline(preprocessor, clf)
+            # run task
+            run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
+            run.publish()
+            new_run = openml.runs.get_run(run.run_id)
+            return new_run
+
+        run1 = column_transformer_pipe(11)  # only categorical
+        TestBase._mark_entity_for_removal("run", run1.run_id)
+        run2 = column_transformer_pipe(23)  # only numeric
+        TestBase._mark_entity_for_removal("run", run2.run_id)
+        self.assertEqual(run1.setup_id, run2.setup_id)
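A plausible reading of why the helpers had to become importable (hedged; the commit message only says "Adding importable helper functions"): module-level functions can be referenced by a stable qualified name when a pipeline is serialized, whereas locally defined functions cannot, as plain pickle already demonstrates:

import pickle
from openml.extensions.sklearn import cat

pickle.dumps(cat)  # works: resolvable as openml.extensions.sklearn.cat

def make_local():
    def cat_local(X):
        return X.dtypes == "category"
    return cat_local

try:
    pickle.dumps(make_local())
except (AttributeError, pickle.PicklingError) as e:
    print(e)  # local objects cannot be pickled by reference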

tests/test_runs/test_run_functions.py

Lines changed: 2 additions & 1 deletion

@@ -20,7 +20,8 @@
 import pandas as pd
 
 import openml.extensions.sklearn
-from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
+from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.extensions.sklearn import cat, cont
 from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction
 from openml.runs.trace import OpenMLRunTrace
 from openml.tasks import TaskType

tests/test_study/test_study_examples.py

Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,7 @@
 # License: BSD 3-Clause
 
-from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
+from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.extensions.sklearn import cat, cont
 
 import sklearn
 import unittest
