From 78b163e8a3f7fe2c977a26912f7a7a4ae2742f2a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 6 Nov 2019 17:15:17 +0100 Subject: [PATCH 1/7] Initial changes to handle reproducible example from the issue --- openml/extensions/sklearn/extension.py | 31 ++++++++++++++++++++------ 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index cc3352a20..1ccb9683b 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -694,10 +694,14 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: # will be part of the name (in brackets) sub_components_names = "" for key in subcomponents: + if isinstance(subcomponents[key], str): + name = subcomponents[key] + else: + name = subcomponents[key].name if key in subcomponents_explicit: - sub_components_names += "," + key + "=" + subcomponents[key].name + sub_components_names += "," + key + "=" + name else: - sub_components_names += "," + subcomponents[key].name + sub_components_names += "," + name if sub_components_names: # slice operation on string in order to get rid of leading comma @@ -769,6 +773,8 @@ def _get_external_version_string( external_versions.add(openml_version) external_versions.add(sklearn_version) for visitee in sub_components.values(): + if isinstance(visitee, str): + continue for external_version in visitee.external_version.split(','): external_versions.add(external_version) return ','.join(list(sorted(external_versions))) @@ -776,14 +782,16 @@ def _get_external_version_string( def _check_multiple_occurence_of_component_in_flow( self, model: Any, - sub_components: Dict[str, OpenMLFlow], + sub_components: Dict[str, Any], ) -> None: to_visit_stack = [] # type: List[OpenMLFlow] to_visit_stack.extend(sub_components.values()) known_sub_components = set() # type: Set[str] while len(to_visit_stack) > 0: visitee = to_visit_stack.pop() - if visitee.name in known_sub_components: + if isinstance(visitee, str): + known_sub_components.add(visitee) + elif visitee.name in known_sub_components: raise ValueError('Found a second occurence of component %s when ' 'trying to serialize %s.' % (visitee.name, model)) else: @@ -796,7 +804,7 @@ def _extract_information_from_model( ) -> Tuple[ 'OrderedDict[str, Optional[str]]', 'OrderedDict[str, Optional[Dict]]', - 'OrderedDict[str, OpenMLFlow]', + 'OrderedDict[str, Any]', Set, ]: # This function contains four "global" states and is quite long and @@ -820,7 +828,7 @@ def _extract_information_from_model( def flatten_all(list_): """ Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]). """ for el in list_: - if isinstance(el, (list, tuple)): + if isinstance(el, (list, tuple)) and len(el) > 0: yield from flatten_all(el) else: yield el @@ -860,7 +868,16 @@ def flatten_all(list_): # length 3 is for ColumnTransformer msg = 'Length of tuple does not match assumptions' raise ValueError(msg) - if not isinstance(sub_component, (OpenMLFlow, type(None))): + + if isinstance(sub_component, str): + if sub_component != 'drop' and sub_component != 'passthrough': + msg = 'Second item of tuple does not match assumptions. ' \ + 'If string, can be only \'drop\' or \'passthrough\' but' \ + 'got %s' % sub_component + raise ValueError(msg) + else: + pass + elif not isinstance(sub_component, (OpenMLFlow, type(None))): msg = 'Second item of tuple does not match assumptions. 
' \ 'Expected OpenMLFlow, got %s' % type(sub_component) raise TypeError(msg) From 4875dbadf075aa6d8cc2c345bf4c4a3f11b426ea Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 12 Nov 2019 20:02:18 +0100 Subject: [PATCH 2/7] Making tentative changes; Need to test deserialization --- openml/extensions/sklearn/extension.py | 32 ++++++----- .../test_sklearn_extension.py | 53 ++++++++++++++++++- 2 files changed, 71 insertions(+), 14 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 1ccb9683b..c7fd4cc2b 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -694,10 +694,12 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: # will be part of the name (in brackets) sub_components_names = "" for key in subcomponents: - if isinstance(subcomponents[key], str): - name = subcomponents[key] - else: + if isinstance(subcomponents[key], OpenMLFlow): name = subcomponents[key].name + elif isinstance(subcomponents[key], str): # 'drop', 'passthrough' can be passed + name = subcomponents[key] + elif subcomponents[key] is None: + name = "None" if key in subcomponents_explicit: sub_components_names += "," + key + "=" + name else: @@ -752,7 +754,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: def _get_external_version_string( self, model: Any, - sub_components: Dict[str, OpenMLFlow], + sub_components: Dict[str, Union[OpenMLFlow, str, None]], ) -> str: # Create external version string for a flow, given the model and the # already parsed dictionary of sub_components. Retrieves the external @@ -773,7 +775,8 @@ def _get_external_version_string( external_versions.add(openml_version) external_versions.add(sklearn_version) for visitee in sub_components.values(): - if isinstance(visitee, str): + # 'drop', 'passthrough', None can be passed as estimators + if isinstance(visitee, str) or visitee is None: continue for external_version in visitee.external_version.split(','): external_versions.add(external_version) @@ -782,15 +785,18 @@ def _get_external_version_string( def _check_multiple_occurence_of_component_in_flow( self, model: Any, - sub_components: Dict[str, Any], + sub_components: Dict[str, Union[OpenMLFlow, str, None]], ) -> None: - to_visit_stack = [] # type: List[OpenMLFlow] + to_visit_stack = [] # type: List[Union[OpenMLFlow, str, None]] to_visit_stack.extend(sub_components.values()) known_sub_components = set() # type: Set[str] + while len(to_visit_stack) > 0: visitee = to_visit_stack.pop() - if isinstance(visitee, str): + if isinstance(visitee, str): # 'drop', 'passthrough' can be passed as estimators known_sub_components.add(visitee) + elif visitee is None: # a None step can be included in a Pipeline + known_sub_components.add(str(visitee)) elif visitee.name in known_sub_components: raise ValueError('Found a second occurence of component %s when ' 'trying to serialize %s.' 
% (visitee.name, model)) @@ -804,7 +810,7 @@ def _extract_information_from_model( ) -> Tuple[ 'OrderedDict[str, Optional[str]]', 'OrderedDict[str, Optional[Dict]]', - 'OrderedDict[str, Any]', + 'OrderedDict[str, Union[OpenMLFlow, str, None]]', Set, ]: # This function contains four "global" states and is quite long and @@ -814,7 +820,7 @@ def _extract_information_from_model( # separate class methods # stores all entities that should become subcomponents - sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow] + sub_components = OrderedDict() # type: OrderedDict[str, Union[OpenMLFlow, str, None]] # stores the keys of all subcomponents that should become sub_components_explicit = set() parameters = OrderedDict() # type: OrderedDict[str, Optional[str]] @@ -858,7 +864,7 @@ def flatten_all(list_): parameter_value = list() # type: List reserved_keywords = set(model.get_params(deep=False).keys()) - for sub_component_tuple in rval: + for i, sub_component_tuple in enumerate(rval): identifier = sub_component_tuple[0] sub_component = sub_component_tuple[1] sub_component_type = type(sub_component_tuple) @@ -891,13 +897,15 @@ def flatten_all(list_): raise PyOpenMLError(msg) if sub_component is None: - # In a FeatureUnion it is legal to have a None step + # In a FeatureUnion, Pipeline it is legal to have a None step pv = [identifier, None] if sub_component_type is tuple: parameter_value.append(tuple(pv)) else: parameter_value.append(pv) + sub_components_explicit.add(identifier) + sub_components[identifier] = sub_component else: # Add the component to the list of components, add a diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index a93c79bcd..300f7ab5f 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -28,7 +28,10 @@ import sklearn.preprocessing import sklearn.tree import sklearn.cluster - +from sklearn.pipeline import make_pipeline +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.impute import SimpleImputer import openml from openml.extensions.sklearn import SklearnExtension @@ -36,7 +39,7 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase, SimpleImputer +from openml.testing import TestBase this_directory = os.path.dirname(os.path.abspath(__file__)) @@ -678,6 +681,7 @@ def test_serialize_feature_union(self): self.assertEqual(serialization.name, 'sklearn.pipeline.FeatureUnion(' 'ohe=sklearn.preprocessing.{}.OneHotEncoder)' + 'scaler=None' .format(module_name_encoder)) new_model = self.extension.flow_to_model(serialization) self.assertEqual(type(new_model), type(fu)) @@ -1776,3 +1780,48 @@ def test_trim_flow_name(self): self.assertEqual("weka.IsolationForest", SklearnExtension.trim_flow_name("weka.IsolationForest")) + + @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer available only after 0.19") + def test_run_on_model_with_empty_steps(self): + # testing 'drop', 'passthrough', None as non-actionable sklearn estimators + dataset = openml.datasets.get_dataset(128) + task = openml.tasks.get_task(59) + + X, y, categorical_ind, feature_names = dataset.get_data( + target=dataset.default_target_attribute, dataset_format='array') + categorical_ind 
= np.array(categorical_ind) + cat_idx, = np.where(categorical_ind) + cont_idx, = np.where(~categorical_ind) + + clf = make_pipeline( + ColumnTransformer([('cat', make_pipeline(SimpleImputer(strategy='most_frequent'), + OneHotEncoder()), cat_idx.tolist()), + ('cont', make_pipeline(SimpleImputer(strategy='median'), + StandardScaler()), cont_idx.tolist())]) + ) + + clf = sklearn.pipeline.Pipeline([ + ('dummystep', 'passthrough'), # adding 'passthrough' as an estimator + ('prep', clf), + ('variancethreshold', None), # adding 'None' as an estimator + ('classifier', sklearn.svm.SVC(gamma='auto')) + ]) + + # adding 'drop' to a ColumnTransformer + if not categorical_ind.any(): + clf[1][0].set_params(cat='drop') + if not (~categorical_ind).any(): + clf[1][0].set_params(cont='drop') + + run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True) + + self.assertEqual(len(flow.components), 4) + self.assertEqual(flow.components['dummystep'], 'passthrough') + self.assertTrue(isinstance(flow.components['classifier'], OpenMLFlow)) + self.assertEqual(flow.components['variancethreshold'], None) + self.assertTrue(isinstance(flow.components['prep'], OpenMLFlow)) + self.assertTrue(isinstance(flow.components['prep'].components['columntransformer'], + OpenMLFlow)) + self.assertEqual(flow.components['prep'].components['columntransformer'].components['cat'], + 'drop') From 0234d199baa4ed9feff5d853a91485ccbecad2ca Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 12 Nov 2019 20:59:33 +0100 Subject: [PATCH 3/7] Fixing deserialization when empty steps in sklearn model --- openml/extensions/sklearn/extension.py | 41 ++++++------------- .../test_sklearn_extension.py | 13 +++++- 2 files changed, 24 insertions(+), 30 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index c7fd4cc2b..fd210e8d2 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -867,7 +867,7 @@ def flatten_all(list_): for i, sub_component_tuple in enumerate(rval): identifier = sub_component_tuple[0] sub_component = sub_component_tuple[1] - sub_component_type = type(sub_component_tuple) + # sub_component_type = type(sub_component_tuple) if not 2 <= len(sub_component_tuple) <= 3: # length 2 is for {VotingClassifier.estimators, # Pipeline.steps, FeatureUnion.transformer_list} @@ -896,33 +896,18 @@ def flatten_all(list_): identifier) raise PyOpenMLError(msg) - if sub_component is None: - # In a FeatureUnion, Pipeline it is legal to have a None step - - pv = [identifier, None] - if sub_component_type is tuple: - parameter_value.append(tuple(pv)) - else: - parameter_value.append(pv) - sub_components_explicit.add(identifier) - sub_components[identifier] = sub_component - - else: - # Add the component to the list of components, add a - # component reference as a placeholder to the list of - # parameters, which will be replaced by the real component - # when deserializing the parameter - sub_components_explicit.add(identifier) - sub_components[identifier] = sub_component - component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]] - component_reference['oml-python:serialized_object'] = 'component_reference' - cr_value = OrderedDict() # type: Dict[str, Any] - cr_value['key'] = identifier - cr_value['step_name'] = identifier - if len(sub_component_tuple) == 3: - cr_value['argument_1'] = sub_component_tuple[2] - component_reference['value'] = cr_value - parameter_value.append(component_reference) + # when deserializing the 
parameter + sub_components_explicit.add(identifier) + sub_components[identifier] = sub_component + component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]] + component_reference['oml-python:serialized_object'] = 'component_reference' + cr_value = OrderedDict() # type: Dict[str, Any] + cr_value['key'] = identifier + cr_value['step_name'] = identifier + if len(sub_component_tuple) == 3: + cr_value['argument_1'] = sub_component_tuple[2] + component_reference['value'] = cr_value + parameter_value.append(component_reference) # Here (and in the elif and else branch below) are the only # places where we encode a value as json to make sure that all diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 300f7ab5f..b1aa3529f 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -680,8 +680,8 @@ def test_serialize_feature_union(self): serialization = self.extension.model_to_flow(fu) self.assertEqual(serialization.name, 'sklearn.pipeline.FeatureUnion(' - 'ohe=sklearn.preprocessing.{}.OneHotEncoder)' - 'scaler=None' + 'ohe=sklearn.preprocessing.{}.OneHotEncoder,' + 'scaler=None)' .format(module_name_encoder)) new_model = self.extension.flow_to_model(serialization) self.assertEqual(type(new_model), type(fu)) @@ -1814,6 +1814,7 @@ def test_run_on_model_with_empty_steps(self): if not (~categorical_ind).any(): clf[1][0].set_params(cont='drop') + # serializing model with non-actionable step run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True) self.assertEqual(len(flow.components), 4) @@ -1825,3 +1826,11 @@ def test_run_on_model_with_empty_steps(self): OpenMLFlow)) self.assertEqual(flow.components['prep'].components['columntransformer'].components['cat'], 'drop') + + # de-serializing flow to a model with non-actionable step + model = self.extension.flow_to_model(flow) + + self.assertEqual(type(model), type(clf)) + self.assertNotEqual(model, clf) + self.assertEqual(len(model.named_steps), 4) + self.assertEqual(model.named_steps['variancethreshold'], None) From cbf0d6a749a4f23245f15668a548f235c61b82f6 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 13 Nov 2019 14:43:32 +0100 Subject: [PATCH 4/7] Fixing flake issues, failing test cases --- openml/extensions/sklearn/extension.py | 10 +++++----- .../test_sklearn_extension/test_sklearn_extension.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index fd210e8d2..d5d5206eb 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -754,7 +754,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: def _get_external_version_string( self, model: Any, - sub_components: Dict[str, Union[OpenMLFlow, str, None]], + sub_components: Dict[str, OpenMLFlow], ) -> str: # Create external version string for a flow, given the model and the # already parsed dictionary of sub_components. 
Retrieves the external @@ -785,9 +785,9 @@ def _get_external_version_string( def _check_multiple_occurence_of_component_in_flow( self, model: Any, - sub_components: Dict[str, Union[OpenMLFlow, str, None]], + sub_components: Dict[str, OpenMLFlow], ) -> None: - to_visit_stack = [] # type: List[Union[OpenMLFlow, str, None]] + to_visit_stack = [] # type: List[OpenMLFlow] to_visit_stack.extend(sub_components.values()) known_sub_components = set() # type: Set[str] @@ -810,7 +810,7 @@ def _extract_information_from_model( ) -> Tuple[ 'OrderedDict[str, Optional[str]]', 'OrderedDict[str, Optional[Dict]]', - 'OrderedDict[str, Union[OpenMLFlow, str, None]]', + 'OrderedDict[str, OpenMLFlow]', Set, ]: # This function contains four "global" states and is quite long and @@ -820,7 +820,7 @@ def _extract_information_from_model( # separate class methods # stores all entities that should become subcomponents - sub_components = OrderedDict() # type: OrderedDict[str, Union[OpenMLFlow, str, None]] + sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow] # stores the keys of all subcomponents that should become sub_components_explicit = set() parameters = OrderedDict() # type: OrderedDict[str, Optional[str]] diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index b1aa3529f..8c731f43d 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -28,10 +28,9 @@ import sklearn.preprocessing import sklearn.tree import sklearn.cluster +from sklearn.impute import SimpleImputer from sklearn.pipeline import make_pipeline -from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder, StandardScaler -from sklearn.impute import SimpleImputer import openml from openml.extensions.sklearn import SklearnExtension @@ -1782,8 +1781,9 @@ def test_trim_flow_name(self): SklearnExtension.trim_flow_name("weka.IsolationForest")) @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20", - reason="SimpleImputer available only after 0.19") + reason="SimpleImputer, ColumnTransformer available only after 0.19") def test_run_on_model_with_empty_steps(self): + from sklearn.compose import ColumnTransformer # testing 'drop', 'passthrough', None as non-actionable sklearn estimators dataset = openml.datasets.get_dataset(128) task = openml.tasks.get_task(59) @@ -1829,7 +1829,7 @@ def test_run_on_model_with_empty_steps(self): # de-serializing flow to a model with non-actionable step model = self.extension.flow_to_model(flow) - + model.fit(X, y) self.assertEqual(type(model), type(clf)) self.assertNotEqual(model, clf) self.assertEqual(len(model.named_steps), 4) From 7768ca3c2d87549fbb10503f9bb3fd6a557a49d6 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 13 Nov 2019 15:28:36 +0100 Subject: [PATCH 5/7] Fixing test cases --- .../test_sklearn_extension/test_sklearn_extension.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 8c731f43d..1b59da9c0 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -28,7 +28,6 @@ import sklearn.preprocessing import sklearn.tree import sklearn.cluster -from sklearn.impute import 
SimpleImputer from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler @@ -38,7 +37,7 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer this_directory = os.path.dirname(os.path.abspath(__file__)) @@ -1780,8 +1779,9 @@ def test_trim_flow_name(self): self.assertEqual("weka.IsolationForest", SklearnExtension.trim_flow_name("weka.IsolationForest")) - @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20", - reason="SimpleImputer, ColumnTransformer available only after 0.19") + @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.21", + reason="SimpleImputer, ColumnTransformer available only after 0.19 and " + "Pipeline till 0.20 doesn't support indexing and 'passthrough'") def test_run_on_model_with_empty_steps(self): from sklearn.compose import ColumnTransformer # testing 'drop', 'passthrough', None as non-actionable sklearn estimators From 35d3c3498b3232a07c4d99baa6da08b4b677af79 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 15 Nov 2019 14:47:16 +0100 Subject: [PATCH 6/7] Dropping support for 'None' as sklearn estimator --- openml/extensions/sklearn/extension.py | 13 +++++++------ .../test_sklearn_extension.py | 14 ++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index d5d5206eb..4cb706f02 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -698,8 +698,6 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: name = subcomponents[key].name elif isinstance(subcomponents[key], str): # 'drop', 'passthrough' can be passed name = subcomponents[key] - elif subcomponents[key] is None: - name = "None" if key in subcomponents_explicit: sub_components_names += "," + key + "=" + name else: @@ -776,7 +774,7 @@ def _get_external_version_string( external_versions.add(sklearn_version) for visitee in sub_components.values(): # 'drop', 'passthrough', None can be passed as estimators - if isinstance(visitee, str) or visitee is None: + if isinstance(visitee, str): continue for external_version in visitee.external_version.split(','): external_versions.add(external_version) @@ -795,8 +793,6 @@ def _check_multiple_occurence_of_component_in_flow( visitee = to_visit_stack.pop() if isinstance(visitee, str): # 'drop', 'passthrough' can be passed as estimators known_sub_components.add(visitee) - elif visitee is None: # a None step can be included in a Pipeline - known_sub_components.add(str(visitee)) elif visitee.name in known_sub_components: raise ValueError('Found a second occurence of component %s when ' 'trying to serialize %s.' % (visitee.name, model)) @@ -883,7 +879,12 @@ def flatten_all(list_): raise ValueError(msg) else: pass - elif not isinstance(sub_component, (OpenMLFlow, type(None))): + elif isinstance(sub_component, type(None)): + msg = 'Cannot serialize objects of None type. Please replace with a '\ + 'relevant placeholder. Note that empty sklearn estimators can be '\ + 'replaced with \'drop\' or \'passthrough\'.' + raise ValueError(msg) + elif not isinstance(sub_component, OpenMLFlow): msg = 'Second item of tuple does not match assumptions. 
' \ 'Expected OpenMLFlow, got %s' % type(sub_component) raise TypeError(msg) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 1b59da9c0..ffba201f0 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -674,17 +674,17 @@ def test_serialize_feature_union(self): self.assertEqual(new_model_params, fu_params) new_model.fit(self.X, self.y) - fu.set_params(scaler=None) + fu.set_params(scaler='drop') serialization = self.extension.model_to_flow(fu) self.assertEqual(serialization.name, 'sklearn.pipeline.FeatureUnion(' 'ohe=sklearn.preprocessing.{}.OneHotEncoder,' - 'scaler=None)' + 'scaler=drop)' .format(module_name_encoder)) new_model = self.extension.flow_to_model(serialization) self.assertEqual(type(new_model), type(fu)) self.assertIsNot(new_model, fu) - self.assertIs(new_model.transformer_list[1][1], None) + self.assertIs(new_model.transformer_list[1][1], 'drop') def test_serialize_feature_union_switched_names(self): ohe_params = ({'categories': 'auto'} @@ -1804,7 +1804,6 @@ def test_run_on_model_with_empty_steps(self): clf = sklearn.pipeline.Pipeline([ ('dummystep', 'passthrough'), # adding 'passthrough' as an estimator ('prep', clf), - ('variancethreshold', None), # adding 'None' as an estimator ('classifier', sklearn.svm.SVC(gamma='auto')) ]) @@ -1817,10 +1816,9 @@ def test_run_on_model_with_empty_steps(self): # serializing model with non-actionable step run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True) - self.assertEqual(len(flow.components), 4) + self.assertEqual(len(flow.components), 3) self.assertEqual(flow.components['dummystep'], 'passthrough') self.assertTrue(isinstance(flow.components['classifier'], OpenMLFlow)) - self.assertEqual(flow.components['variancethreshold'], None) self.assertTrue(isinstance(flow.components['prep'], OpenMLFlow)) self.assertTrue(isinstance(flow.components['prep'].components['columntransformer'], OpenMLFlow)) @@ -1832,5 +1830,5 @@ def test_run_on_model_with_empty_steps(self): model.fit(X, y) self.assertEqual(type(model), type(clf)) self.assertNotEqual(model, clf) - self.assertEqual(len(model.named_steps), 4) - self.assertEqual(model.named_steps['variancethreshold'], None) + self.assertEqual(len(model.named_steps), 3) + self.assertEqual(model.named_steps['dummystep'], 'passthrough') From 3b03d151815f70c71c0a6ebab8218b35a9f61d7e Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 18 Nov 2019 15:22:16 +0100 Subject: [PATCH 7/7] Adding test case for None estimator --- openml/extensions/sklearn/extension.py | 4 ++-- .../test_sklearn_extension/test_sklearn_extension.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 4cb706f02..f1a25cbbe 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -880,8 +880,8 @@ def flatten_all(list_): else: pass elif isinstance(sub_component, type(None)): - msg = 'Cannot serialize objects of None type. Please replace with a '\ - 'relevant placeholder. Note that empty sklearn estimators can be '\ + msg = 'Cannot serialize objects of None type. Please use a valid ' \ + 'placeholder for None. Note that empty sklearn estimators can be '\ 'replaced with \'drop\' or \'passthrough\'.' 
raise ValueError(msg) elif not isinstance(sub_component, OpenMLFlow): diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index ffba201f0..3ab9d8936 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -608,6 +608,8 @@ def test_serialize_column_transformer_pipeline(self): serialization2 = self.extension.model_to_flow(new_model) assert_flows_equal(serialization, serialization2) + @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20", + reason="Pipeline processing behaviour updated") def test_serialize_feature_union(self): ohe_params = {'sparse': False} if LooseVersion(sklearn.__version__) >= "0.20": @@ -1832,3 +1834,12 @@ def test_run_on_model_with_empty_steps(self): self.assertNotEqual(model, clf) self.assertEqual(len(model.named_steps), 3) self.assertEqual(model.named_steps['dummystep'], 'passthrough') + + def test_sklearn_serialization_with_none_step(self): + msg = 'Cannot serialize objects of None type. Please use a valid ' \ + 'placeholder for None. Note that empty sklearn estimators can be ' \ + 'replaced with \'drop\' or \'passthrough\'.' + clf = sklearn.pipeline.Pipeline([('dummystep', None), + ('classifier', sklearn.svm.SVC(gamma='auto'))]) + with self.assertRaisesRegex(ValueError, msg): + self.extension.model_to_flow(clf)
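
Usage sketch (not part of the patches above): the snippet below illustrates, assuming scikit-learn >= 0.21 and the openml-python branch these patches apply to, how a pipeline containing the placeholder steps 'passthrough' and 'drop' serializes to an OpenMLFlow and round-trips back, and how a literal None step is now rejected. The estimator layout, step names, and column indices are illustrative only; the API calls (SklearnExtension.model_to_flow / flow_to_model) are the ones exercised by the tests in the patches.

    # Illustrative sketch; mirrors test_run_on_model_with_empty_steps and
    # test_sklearn_serialization_with_none_step from the patches above.
    import sklearn.pipeline
    import sklearn.svm
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler

    from openml.extensions.sklearn import SklearnExtension

    extension = SklearnExtension()

    # 'drop' and 'passthrough' are plain strings, not estimator objects;
    # after these patches they are preserved as string sub-components of the flow.
    prep = ColumnTransformer([
        ('scale', StandardScaler(), [0, 1]),  # column indices chosen only for illustration
        ('unused', 'drop', [2]),
    ])
    clf = sklearn.pipeline.Pipeline([
        ('dummystep', 'passthrough'),
        ('prep', prep),
        ('classifier', sklearn.svm.SVC(gamma='auto')),
    ])

    flow = extension.model_to_flow(clf)  # local serialization, no server call needed
    assert flow.components['dummystep'] == 'passthrough'
    # Expected component layout by analogy with the tests above:
    # the ColumnTransformer keeps 'drop' as the string component for 'unused'.
    assert flow.components['prep'].components['unused'] == 'drop'

    model = extension.flow_to_model(flow)  # deserialization restores the string steps
    assert model.named_steps['dummystep'] == 'passthrough'

    # A literal None step is no longer serializable (patches 6/7 and 7/7); the
    # extension asks for 'drop' or 'passthrough' as an explicit placeholder instead.
    bad = sklearn.pipeline.Pipeline([
        ('dummystep', None),
        ('classifier', sklearn.svm.SVC(gamma='auto')),
    ])
    try:
        extension.model_to_flow(bad)
    except ValueError as e:
        print(e)  # "Cannot serialize objects of None type. ..."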