diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
index cc3352a20..f1a25cbbe 100644
--- a/openml/extensions/sklearn/extension.py
+++ b/openml/extensions/sklearn/extension.py
@@ -694,10 +694,14 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
         # will be part of the name (in brackets)
         sub_components_names = ""
         for key in subcomponents:
+            if isinstance(subcomponents[key], OpenMLFlow):
+                name = subcomponents[key].name
+            elif isinstance(subcomponents[key], str):  # 'drop', 'passthrough' can be passed
+                name = subcomponents[key]
             if key in subcomponents_explicit:
-                sub_components_names += "," + key + "=" + subcomponents[key].name
+                sub_components_names += "," + key + "=" + name
             else:
-                sub_components_names += "," + subcomponents[key].name
+                sub_components_names += "," + name
 
         if sub_components_names:
             # slice operation on string in order to get rid of leading comma
@@ -769,6 +773,9 @@ def _get_external_version_string(
         external_versions.add(openml_version)
         external_versions.add(sklearn_version)
         for visitee in sub_components.values():
+            # 'drop', 'passthrough', None can be passed as estimators
+            if isinstance(visitee, str):
+                continue
             for external_version in visitee.external_version.split(','):
                 external_versions.add(external_version)
         return ','.join(list(sorted(external_versions)))
@@ -781,9 +788,12 @@ def _check_multiple_occurence_of_component_in_flow(
         to_visit_stack = []  # type: List[OpenMLFlow]
         to_visit_stack.extend(sub_components.values())
         known_sub_components = set()  # type: Set[str]
+
         while len(to_visit_stack) > 0:
             visitee = to_visit_stack.pop()
-            if visitee.name in known_sub_components:
+            if isinstance(visitee, str):  # 'drop', 'passthrough' can be passed as estimators
+                known_sub_components.add(visitee)
+            elif visitee.name in known_sub_components:
                 raise ValueError('Found a second occurence of component %s when '
                                  'trying to serialize %s.' % (visitee.name, model))
             else:
@@ -820,7 +830,7 @@ def _extract_information_from_model(
         def flatten_all(list_):
             """ Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]). """
             for el in list_:
-                if isinstance(el, (list, tuple)):
+                if isinstance(el, (list, tuple)) and len(el) > 0:
                     yield from flatten_all(el)
                 else:
                     yield el
@@ -850,17 +860,31 @@ def flatten_all(list_):
                 parameter_value = list()  # type: List
                 reserved_keywords = set(model.get_params(deep=False).keys())
 
-                for sub_component_tuple in rval:
+                for i, sub_component_tuple in enumerate(rval):
                     identifier = sub_component_tuple[0]
                     sub_component = sub_component_tuple[1]
-                    sub_component_type = type(sub_component_tuple)
+                    # sub_component_type = type(sub_component_tuple)
                     if not 2 <= len(sub_component_tuple) <= 3:
                         # length 2 is for {VotingClassifier.estimators,
                         # Pipeline.steps, FeatureUnion.transformer_list}
                         # length 3 is for ColumnTransformer
                         msg = 'Length of tuple does not match assumptions'
                         raise ValueError(msg)
-                    if not isinstance(sub_component, (OpenMLFlow, type(None))):
+
+                    if isinstance(sub_component, str):
+                        if sub_component != 'drop' and sub_component != 'passthrough':
+                            msg = 'Second item of tuple does not match assumptions. ' \
+                                  'If string, can be only \'drop\' or \'passthrough\' but ' \
+                                  'got %s' % sub_component
+                            raise ValueError(msg)
+                        else:
+                            pass
+                    elif isinstance(sub_component, type(None)):
+                        msg = 'Cannot serialize objects of None type. Please use a valid ' \
+                              'placeholder for None. Note that empty sklearn estimators can be ' \
+                              'replaced with \'drop\' or \'passthrough\'.'
+                        raise ValueError(msg)
+                    elif not isinstance(sub_component, OpenMLFlow):
                         msg = 'Second item of tuple does not match assumptions. ' \
                               'Expected OpenMLFlow, got %s' % type(sub_component)
                         raise TypeError(msg)
@@ -873,31 +897,18 @@ def flatten_all(list_):
                                                         identifier)
                         raise PyOpenMLError(msg)
 
-                    if sub_component is None:
-                        # In a FeatureUnion it is legal to have a None step
-
-                        pv = [identifier, None]
-                        if sub_component_type is tuple:
-                            parameter_value.append(tuple(pv))
-                        else:
-                            parameter_value.append(pv)
-
-                    else:
-                        # Add the component to the list of components, add a
-                        # component reference as a placeholder to the list of
-                        # parameters, which will be replaced by the real component
-                        # when deserializing the parameter
-                        sub_components_explicit.add(identifier)
-                        sub_components[identifier] = sub_component
-                        component_reference = OrderedDict()  # type: Dict[str, Union[str, Dict]]
-                        component_reference['oml-python:serialized_object'] = 'component_reference'
-                        cr_value = OrderedDict()  # type: Dict[str, Any]
-                        cr_value['key'] = identifier
-                        cr_value['step_name'] = identifier
-                        if len(sub_component_tuple) == 3:
-                            cr_value['argument_1'] = sub_component_tuple[2]
-                        component_reference['value'] = cr_value
-                        parameter_value.append(component_reference)
+                    # when deserializing the parameter
+                    sub_components_explicit.add(identifier)
+                    sub_components[identifier] = sub_component
+                    component_reference = OrderedDict()  # type: Dict[str, Union[str, Dict]]
+                    component_reference['oml-python:serialized_object'] = 'component_reference'
+                    cr_value = OrderedDict()  # type: Dict[str, Any]
+                    cr_value['key'] = identifier
+                    cr_value['step_name'] = identifier
+                    if len(sub_component_tuple) == 3:
+                        cr_value['argument_1'] = sub_component_tuple[2]
+                    component_reference['value'] = cr_value
+                    parameter_value.append(component_reference)
 
                     # Here (and in the elif and else branch below) are the only
                     # places where we encode a value as json to make sure that all
diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
index a93c79bcd..3ab9d8936 100644
--- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
+++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
@@ -28,7 +28,8 @@
 import sklearn.preprocessing
 import sklearn.tree
 import sklearn.cluster
-
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
 import openml
 from openml.extensions.sklearn import SklearnExtension
 
@@ -607,6 +608,8 @@ def test_serialize_column_transformer_pipeline(self):
         serialization2 = self.extension.model_to_flow(new_model)
         assert_flows_equal(serialization, serialization2)
 
+    @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
+                     reason="Pipeline processing behaviour updated")
     def test_serialize_feature_union(self):
         ohe_params = {'sparse': False}
         if LooseVersion(sklearn.__version__) >= "0.20":
@@ -673,16 +676,17 @@ def test_serialize_feature_union(self):
         self.assertEqual(new_model_params, fu_params)
         new_model.fit(self.X, self.y)
 
-        fu.set_params(scaler=None)
+        fu.set_params(scaler='drop')
         serialization = self.extension.model_to_flow(fu)
         self.assertEqual(serialization.name,
                          'sklearn.pipeline.FeatureUnion('
-                         'ohe=sklearn.preprocessing.{}.OneHotEncoder)'
+                         'ohe=sklearn.preprocessing.{}.OneHotEncoder,'
+                         'scaler=drop)'
                          .format(module_name_encoder))
         new_model = self.extension.flow_to_model(serialization)
         self.assertEqual(type(new_model), type(fu))
         self.assertIsNot(new_model, fu)
-        self.assertIs(new_model.transformer_list[1][1], None)
+        self.assertIs(new_model.transformer_list[1][1], 'drop')
 
     def test_serialize_feature_union_switched_names(self):
         ohe_params = ({'categories': 'auto'}
@@ -1776,3 +1780,66 @@ def test_trim_flow_name(self):
 
         self.assertEqual("weka.IsolationForest",
                          SklearnExtension.trim_flow_name("weka.IsolationForest"))
+
+    @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.21",
+                     reason="SimpleImputer, ColumnTransformer available only after 0.19 and "
+                            "Pipeline till 0.20 doesn't support indexing and 'passthrough'")
+    def test_run_on_model_with_empty_steps(self):
+        from sklearn.compose import ColumnTransformer
+        # testing 'drop', 'passthrough', None as non-actionable sklearn estimators
+        dataset = openml.datasets.get_dataset(128)
+        task = openml.tasks.get_task(59)
+
+        X, y, categorical_ind, feature_names = dataset.get_data(
+            target=dataset.default_target_attribute, dataset_format='array')
+        categorical_ind = np.array(categorical_ind)
+        cat_idx, = np.where(categorical_ind)
+        cont_idx, = np.where(~categorical_ind)
+
+        clf = make_pipeline(
+            ColumnTransformer([('cat', make_pipeline(SimpleImputer(strategy='most_frequent'),
+                                                     OneHotEncoder()), cat_idx.tolist()),
+                               ('cont', make_pipeline(SimpleImputer(strategy='median'),
+                                                      StandardScaler()), cont_idx.tolist())])
+        )
+
+        clf = sklearn.pipeline.Pipeline([
+            ('dummystep', 'passthrough'),  # adding 'passthrough' as an estimator
+            ('prep', clf),
+            ('classifier', sklearn.svm.SVC(gamma='auto'))
+        ])
+
+        # adding 'drop' to a ColumnTransformer
+        if not categorical_ind.any():
+            clf[1][0].set_params(cat='drop')
+        if not (~categorical_ind).any():
+            clf[1][0].set_params(cont='drop')
+
+        # serializing model with non-actionable step
+        run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True)
+
+        self.assertEqual(len(flow.components), 3)
+        self.assertEqual(flow.components['dummystep'], 'passthrough')
+        self.assertTrue(isinstance(flow.components['classifier'], OpenMLFlow))
+        self.assertTrue(isinstance(flow.components['prep'], OpenMLFlow))
+        self.assertTrue(isinstance(flow.components['prep'].components['columntransformer'],
+                                   OpenMLFlow))
+        self.assertEqual(flow.components['prep'].components['columntransformer'].components['cat'],
+                         'drop')
+
+        # de-serializing flow to a model with non-actionable step
+        model = self.extension.flow_to_model(flow)
+        model.fit(X, y)
+        self.assertEqual(type(model), type(clf))
+        self.assertNotEqual(model, clf)
+        self.assertEqual(len(model.named_steps), 3)
+        self.assertEqual(model.named_steps['dummystep'], 'passthrough')
+
+    def test_sklearn_serialization_with_none_step(self):
+        msg = 'Cannot serialize objects of None type. Please use a valid ' \
+              'placeholder for None. Note that empty sklearn estimators can be ' \
+              'replaced with \'drop\' or \'passthrough\'.'
+        clf = sklearn.pipeline.Pipeline([('dummystep', None),
+                                         ('classifier', sklearn.svm.SVC(gamma='auto'))])
+        with self.assertRaisesRegex(ValueError, msg):
+            self.extension.model_to_flow(clf)
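For reference, a minimal sketch (not part of the patch) of the round-trip behaviour this diff enables, assuming scikit-learn >= 0.21 and an installed openml package; it mirrors the assertions in the new tests rather than introducing any new API:

```python
import sklearn.pipeline
import sklearn.svm
from openml.extensions.sklearn import SklearnExtension

extension = SklearnExtension()

# A step given as the string 'passthrough' is non-actionable: it has no
# parameters and no module, so the extension keeps it as a plain string.
clf = sklearn.pipeline.Pipeline([
    ('dummystep', 'passthrough'),
    ('classifier', sklearn.svm.SVC(gamma='auto')),
])

# Serialization stores the placeholder as a string-valued component of the
# flow (before this patch, model_to_flow failed on string steps).
flow = extension.model_to_flow(clf)
assert flow.components['dummystep'] == 'passthrough'

# Deserialization restores the placeholder unchanged; a None step instead
# raises a ValueError directing the user to 'drop' or 'passthrough'.
new_clf = extension.flow_to_model(flow)
assert new_clf.named_steps['dummystep'] == 'passthrough'
```

The same string handling applies to 'drop' in FeatureUnion and ColumnTransformer components, as exercised by test_serialize_feature_union and test_run_on_model_with_empty_steps above.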