From 78b163e8a3f7fe2c977a26912f7a7a4ae2742f2a Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 6 Nov 2019 17:15:17 +0100 Subject: [PATCH 1/7] Initial changes to handle reproducible example from the issue --- openml/extensions/sklearn/extension.py | 31 ++++++++++++++++++++------ 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index cc3352a20..1ccb9683b 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -694,10 +694,14 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: # will be part of the name (in brackets) sub_components_names = "" for key in subcomponents: + if isinstance(subcomponents[key], str): + name = subcomponents[key] + else: + name = subcomponents[key].name if key in subcomponents_explicit: - sub_components_names += "," + key + "=" + subcomponents[key].name + sub_components_names += "," + key + "=" + name else: - sub_components_names += "," + subcomponents[key].name + sub_components_names += "," + name if sub_components_names: # slice operation on string in order to get rid of leading comma @@ -769,6 +773,8 @@ def _get_external_version_string( external_versions.add(openml_version) external_versions.add(sklearn_version) for visitee in sub_components.values(): + if isinstance(visitee, str): + continue for external_version in visitee.external_version.split(','): external_versions.add(external_version) return ','.join(list(sorted(external_versions))) @@ -776,14 +782,16 @@ def _get_external_version_string( def _check_multiple_occurence_of_component_in_flow( self, model: Any, - sub_components: Dict[str, OpenMLFlow], + sub_components: Dict[str, Any], ) -> None: to_visit_stack = [] # type: List[OpenMLFlow] to_visit_stack.extend(sub_components.values()) known_sub_components = set() # type: Set[str] while len(to_visit_stack) > 0: visitee = to_visit_stack.pop() - if visitee.name in known_sub_components: + if isinstance(visitee, str): + known_sub_components.add(visitee) + elif visitee.name in known_sub_components: raise ValueError('Found a second occurence of component %s when ' 'trying to serialize %s.' % (visitee.name, model)) else: @@ -796,7 +804,7 @@ def _extract_information_from_model( ) -> Tuple[ 'OrderedDict[str, Optional[str]]', 'OrderedDict[str, Optional[Dict]]', - 'OrderedDict[str, OpenMLFlow]', + 'OrderedDict[str, Any]', Set, ]: # This function contains four "global" states and is quite long and @@ -820,7 +828,7 @@ def _extract_information_from_model( def flatten_all(list_): """ Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]). """ for el in list_: - if isinstance(el, (list, tuple)): + if isinstance(el, (list, tuple)) and len(el) > 0: yield from flatten_all(el) else: yield el @@ -860,7 +868,16 @@ def flatten_all(list_): # length 3 is for ColumnTransformer msg = 'Length of tuple does not match assumptions' raise ValueError(msg) - if not isinstance(sub_component, (OpenMLFlow, type(None))): + + if isinstance(sub_component, str): + if sub_component != 'drop' and sub_component != 'passthrough': + msg = 'Second item of tuple does not match assumptions. ' \ + 'If string, can be only \'drop\' or \'passthrough\' but' \ + 'got %s' % sub_component + raise ValueError(msg) + else: + pass + elif not isinstance(sub_component, (OpenMLFlow, type(None))): msg = 'Second item of tuple does not match assumptions. 
' \ 'Expected OpenMLFlow, got %s' % type(sub_component) raise TypeError(msg) From 4875dbadf075aa6d8cc2c345bf4c4a3f11b426ea Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 12 Nov 2019 20:02:18 +0100 Subject: [PATCH 2/7] Making tentative changes; Need to test deserialization --- openml/extensions/sklearn/extension.py | 32 ++++++----- .../test_sklearn_extension.py | 53 ++++++++++++++++++- 2 files changed, 71 insertions(+), 14 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 1ccb9683b..c7fd4cc2b 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -694,10 +694,12 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: # will be part of the name (in brackets) sub_components_names = "" for key in subcomponents: - if isinstance(subcomponents[key], str): - name = subcomponents[key] - else: + if isinstance(subcomponents[key], OpenMLFlow): name = subcomponents[key].name + elif isinstance(subcomponents[key], str): # 'drop', 'passthrough' can be passed + name = subcomponents[key] + elif subcomponents[key] is None: + name = "None" if key in subcomponents_explicit: sub_components_names += "," + key + "=" + name else: @@ -752,7 +754,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: def _get_external_version_string( self, model: Any, - sub_components: Dict[str, OpenMLFlow], + sub_components: Dict[str, Union[OpenMLFlow, str, None]], ) -> str: # Create external version string for a flow, given the model and the # already parsed dictionary of sub_components. Retrieves the external @@ -773,7 +775,8 @@ def _get_external_version_string( external_versions.add(openml_version) external_versions.add(sklearn_version) for visitee in sub_components.values(): - if isinstance(visitee, str): + # 'drop', 'passthrough', None can be passed as estimators + if isinstance(visitee, str) or visitee is None: continue for external_version in visitee.external_version.split(','): external_versions.add(external_version) @@ -782,15 +785,18 @@ def _get_external_version_string( def _check_multiple_occurence_of_component_in_flow( self, model: Any, - sub_components: Dict[str, Any], + sub_components: Dict[str, Union[OpenMLFlow, str, None]], ) -> None: - to_visit_stack = [] # type: List[OpenMLFlow] + to_visit_stack = [] # type: List[Union[OpenMLFlow, str, None]] to_visit_stack.extend(sub_components.values()) known_sub_components = set() # type: Set[str] + while len(to_visit_stack) > 0: visitee = to_visit_stack.pop() - if isinstance(visitee, str): + if isinstance(visitee, str): # 'drop', 'passthrough' can be passed as estimators known_sub_components.add(visitee) + elif visitee is None: # a None step can be included in a Pipeline + known_sub_components.add(str(visitee)) elif visitee.name in known_sub_components: raise ValueError('Found a second occurence of component %s when ' 'trying to serialize %s.' 
% (visitee.name, model)) @@ -804,7 +810,7 @@ def _extract_information_from_model( ) -> Tuple[ 'OrderedDict[str, Optional[str]]', 'OrderedDict[str, Optional[Dict]]', - 'OrderedDict[str, Any]', + 'OrderedDict[str, Union[OpenMLFlow, str, None]]', Set, ]: # This function contains four "global" states and is quite long and @@ -814,7 +820,7 @@ def _extract_information_from_model( # separate class methods # stores all entities that should become subcomponents - sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow] + sub_components = OrderedDict() # type: OrderedDict[str, Union[OpenMLFlow, str, None]] # stores the keys of all subcomponents that should become sub_components_explicit = set() parameters = OrderedDict() # type: OrderedDict[str, Optional[str]] @@ -858,7 +864,7 @@ def flatten_all(list_): parameter_value = list() # type: List reserved_keywords = set(model.get_params(deep=False).keys()) - for sub_component_tuple in rval: + for i, sub_component_tuple in enumerate(rval): identifier = sub_component_tuple[0] sub_component = sub_component_tuple[1] sub_component_type = type(sub_component_tuple) @@ -891,13 +897,15 @@ def flatten_all(list_): raise PyOpenMLError(msg) if sub_component is None: - # In a FeatureUnion it is legal to have a None step + # In a FeatureUnion, Pipeline it is legal to have a None step pv = [identifier, None] if sub_component_type is tuple: parameter_value.append(tuple(pv)) else: parameter_value.append(pv) + sub_components_explicit.add(identifier) + sub_components[identifier] = sub_component else: # Add the component to the list of components, add a diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index a93c79bcd..300f7ab5f 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -28,7 +28,10 @@ import sklearn.preprocessing import sklearn.tree import sklearn.cluster - +from sklearn.pipeline import make_pipeline +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.impute import SimpleImputer import openml from openml.extensions.sklearn import SklearnExtension @@ -36,7 +39,7 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase, SimpleImputer +from openml.testing import TestBase this_directory = os.path.dirname(os.path.abspath(__file__)) @@ -678,6 +681,7 @@ def test_serialize_feature_union(self): self.assertEqual(serialization.name, 'sklearn.pipeline.FeatureUnion(' 'ohe=sklearn.preprocessing.{}.OneHotEncoder)' + 'scaler=None' .format(module_name_encoder)) new_model = self.extension.flow_to_model(serialization) self.assertEqual(type(new_model), type(fu)) @@ -1776,3 +1780,48 @@ def test_trim_flow_name(self): self.assertEqual("weka.IsolationForest", SklearnExtension.trim_flow_name("weka.IsolationForest")) + + @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer available only after 0.19") + def test_run_on_model_with_empty_steps(self): + # testing 'drop', 'passthrough', None as non-actionable sklearn estimators + dataset = openml.datasets.get_dataset(128) + task = openml.tasks.get_task(59) + + X, y, categorical_ind, feature_names = dataset.get_data( + target=dataset.default_target_attribute, dataset_format='array') + categorical_ind 
= np.array(categorical_ind) + cat_idx, = np.where(categorical_ind) + cont_idx, = np.where(~categorical_ind) + + clf = make_pipeline( + ColumnTransformer([('cat', make_pipeline(SimpleImputer(strategy='most_frequent'), + OneHotEncoder()), cat_idx.tolist()), + ('cont', make_pipeline(SimpleImputer(strategy='median'), + StandardScaler()), cont_idx.tolist())]) + ) + + clf = sklearn.pipeline.Pipeline([ + ('dummystep', 'passthrough'), # adding 'passthrough' as an estimator + ('prep', clf), + ('variancethreshold', None), # adding 'None' as an estimator + ('classifier', sklearn.svm.SVC(gamma='auto')) + ]) + + # adding 'drop' to a ColumnTransformer + if not categorical_ind.any(): + clf[1][0].set_params(cat='drop') + if not (~categorical_ind).any(): + clf[1][0].set_params(cont='drop') + + run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True) + + self.assertEqual(len(flow.components), 4) + self.assertEqual(flow.components['dummystep'], 'passthrough') + self.assertTrue(isinstance(flow.components['classifier'], OpenMLFlow)) + self.assertEqual(flow.components['variancethreshold'], None) + self.assertTrue(isinstance(flow.components['prep'], OpenMLFlow)) + self.assertTrue(isinstance(flow.components['prep'].components['columntransformer'], + OpenMLFlow)) + self.assertEqual(flow.components['prep'].components['columntransformer'].components['cat'], + 'drop') From 0234d199baa4ed9feff5d853a91485ccbecad2ca Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 12 Nov 2019 20:59:33 +0100 Subject: [PATCH 3/7] Fixing deserialization when empty steps in sklearn model --- openml/extensions/sklearn/extension.py | 41 ++++++------------- .../test_sklearn_extension.py | 13 +++++- 2 files changed, 24 insertions(+), 30 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index c7fd4cc2b..fd210e8d2 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -867,7 +867,7 @@ def flatten_all(list_): for i, sub_component_tuple in enumerate(rval): identifier = sub_component_tuple[0] sub_component = sub_component_tuple[1] - sub_component_type = type(sub_component_tuple) + # sub_component_type = type(sub_component_tuple) if not 2 <= len(sub_component_tuple) <= 3: # length 2 is for {VotingClassifier.estimators, # Pipeline.steps, FeatureUnion.transformer_list} @@ -896,33 +896,18 @@ def flatten_all(list_): identifier) raise PyOpenMLError(msg) - if sub_component is None: - # In a FeatureUnion, Pipeline it is legal to have a None step - - pv = [identifier, None] - if sub_component_type is tuple: - parameter_value.append(tuple(pv)) - else: - parameter_value.append(pv) - sub_components_explicit.add(identifier) - sub_components[identifier] = sub_component - - else: - # Add the component to the list of components, add a - # component reference as a placeholder to the list of - # parameters, which will be replaced by the real component - # when deserializing the parameter - sub_components_explicit.add(identifier) - sub_components[identifier] = sub_component - component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]] - component_reference['oml-python:serialized_object'] = 'component_reference' - cr_value = OrderedDict() # type: Dict[str, Any] - cr_value['key'] = identifier - cr_value['step_name'] = identifier - if len(sub_component_tuple) == 3: - cr_value['argument_1'] = sub_component_tuple[2] - component_reference['value'] = cr_value - parameter_value.append(component_reference) + # when deserializing the 
parameter + sub_components_explicit.add(identifier) + sub_components[identifier] = sub_component + component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]] + component_reference['oml-python:serialized_object'] = 'component_reference' + cr_value = OrderedDict() # type: Dict[str, Any] + cr_value['key'] = identifier + cr_value['step_name'] = identifier + if len(sub_component_tuple) == 3: + cr_value['argument_1'] = sub_component_tuple[2] + component_reference['value'] = cr_value + parameter_value.append(component_reference) # Here (and in the elif and else branch below) are the only # places where we encode a value as json to make sure that all diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 300f7ab5f..b1aa3529f 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -680,8 +680,8 @@ def test_serialize_feature_union(self): serialization = self.extension.model_to_flow(fu) self.assertEqual(serialization.name, 'sklearn.pipeline.FeatureUnion(' - 'ohe=sklearn.preprocessing.{}.OneHotEncoder)' - 'scaler=None' + 'ohe=sklearn.preprocessing.{}.OneHotEncoder,' + 'scaler=None)' .format(module_name_encoder)) new_model = self.extension.flow_to_model(serialization) self.assertEqual(type(new_model), type(fu)) @@ -1814,6 +1814,7 @@ def test_run_on_model_with_empty_steps(self): if not (~categorical_ind).any(): clf[1][0].set_params(cont='drop') + # serializing model with non-actionable step run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True) self.assertEqual(len(flow.components), 4) @@ -1825,3 +1826,11 @@ def test_run_on_model_with_empty_steps(self): OpenMLFlow)) self.assertEqual(flow.components['prep'].components['columntransformer'].components['cat'], 'drop') + + # de-serializing flow to a model with non-actionable step + model = self.extension.flow_to_model(flow) + + self.assertEqual(type(model), type(clf)) + self.assertNotEqual(model, clf) + self.assertEqual(len(model.named_steps), 4) + self.assertEqual(model.named_steps['variancethreshold'], None) From cbf0d6a749a4f23245f15668a548f235c61b82f6 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 13 Nov 2019 14:43:32 +0100 Subject: [PATCH 4/7] Fixing flake issues, failing test cases --- openml/extensions/sklearn/extension.py | 10 +++++----- .../test_sklearn_extension/test_sklearn_extension.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index fd210e8d2..d5d5206eb 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -754,7 +754,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: def _get_external_version_string( self, model: Any, - sub_components: Dict[str, Union[OpenMLFlow, str, None]], + sub_components: Dict[str, OpenMLFlow], ) -> str: # Create external version string for a flow, given the model and the # already parsed dictionary of sub_components. 
Retrieves the external @@ -785,9 +785,9 @@ def _get_external_version_string( def _check_multiple_occurence_of_component_in_flow( self, model: Any, - sub_components: Dict[str, Union[OpenMLFlow, str, None]], + sub_components: Dict[str, OpenMLFlow], ) -> None: - to_visit_stack = [] # type: List[Union[OpenMLFlow, str, None]] + to_visit_stack = [] # type: List[OpenMLFlow] to_visit_stack.extend(sub_components.values()) known_sub_components = set() # type: Set[str] @@ -810,7 +810,7 @@ def _extract_information_from_model( ) -> Tuple[ 'OrderedDict[str, Optional[str]]', 'OrderedDict[str, Optional[Dict]]', - 'OrderedDict[str, Union[OpenMLFlow, str, None]]', + 'OrderedDict[str, OpenMLFlow]', Set, ]: # This function contains four "global" states and is quite long and @@ -820,7 +820,7 @@ def _extract_information_from_model( # separate class methods # stores all entities that should become subcomponents - sub_components = OrderedDict() # type: OrderedDict[str, Union[OpenMLFlow, str, None]] + sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow] # stores the keys of all subcomponents that should become sub_components_explicit = set() parameters = OrderedDict() # type: OrderedDict[str, Optional[str]] diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index b1aa3529f..8c731f43d 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -28,10 +28,9 @@ import sklearn.preprocessing import sklearn.tree import sklearn.cluster +from sklearn.impute import SimpleImputer from sklearn.pipeline import make_pipeline -from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder, StandardScaler -from sklearn.impute import SimpleImputer import openml from openml.extensions.sklearn import SklearnExtension @@ -1782,8 +1781,9 @@ def test_trim_flow_name(self): SklearnExtension.trim_flow_name("weka.IsolationForest")) @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20", - reason="SimpleImputer available only after 0.19") + reason="SimpleImputer, ColumnTransformer available only after 0.19") def test_run_on_model_with_empty_steps(self): + from sklearn.compose import ColumnTransformer # testing 'drop', 'passthrough', None as non-actionable sklearn estimators dataset = openml.datasets.get_dataset(128) task = openml.tasks.get_task(59) @@ -1829,7 +1829,7 @@ def test_run_on_model_with_empty_steps(self): # de-serializing flow to a model with non-actionable step model = self.extension.flow_to_model(flow) - + model.fit(X, y) self.assertEqual(type(model), type(clf)) self.assertNotEqual(model, clf) self.assertEqual(len(model.named_steps), 4) From 7768ca3c2d87549fbb10503f9bb3fd6a557a49d6 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Wed, 13 Nov 2019 15:28:36 +0100 Subject: [PATCH 5/7] Fixing test cases --- .../test_sklearn_extension/test_sklearn_extension.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 8c731f43d..1b59da9c0 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -28,7 +28,6 @@ import sklearn.preprocessing import sklearn.tree import sklearn.cluster -from sklearn.impute import 
SimpleImputer from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler @@ -38,7 +37,7 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer this_directory = os.path.dirname(os.path.abspath(__file__)) @@ -1780,8 +1779,9 @@ def test_trim_flow_name(self): self.assertEqual("weka.IsolationForest", SklearnExtension.trim_flow_name("weka.IsolationForest")) - @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20", - reason="SimpleImputer, ColumnTransformer available only after 0.19") + @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.21", + reason="SimpleImputer, ColumnTransformer available only after 0.19 and " + "Pipeline till 0.20 doesn't support indexing and 'passthrough'") def test_run_on_model_with_empty_steps(self): from sklearn.compose import ColumnTransformer # testing 'drop', 'passthrough', None as non-actionable sklearn estimators From 35d3c3498b3232a07c4d99baa6da08b4b677af79 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Fri, 15 Nov 2019 14:47:16 +0100 Subject: [PATCH 6/7] Dropping support for 'None' as sklearn estimator --- openml/extensions/sklearn/extension.py | 13 +++++++------ .../test_sklearn_extension.py | 14 ++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index d5d5206eb..4cb706f02 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -698,8 +698,6 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: name = subcomponents[key].name elif isinstance(subcomponents[key], str): # 'drop', 'passthrough' can be passed name = subcomponents[key] - elif subcomponents[key] is None: - name = "None" if key in subcomponents_explicit: sub_components_names += "," + key + "=" + name else: @@ -776,7 +774,7 @@ def _get_external_version_string( external_versions.add(sklearn_version) for visitee in sub_components.values(): # 'drop', 'passthrough', None can be passed as estimators - if isinstance(visitee, str) or visitee is None: + if isinstance(visitee, str): continue for external_version in visitee.external_version.split(','): external_versions.add(external_version) @@ -795,8 +793,6 @@ def _check_multiple_occurence_of_component_in_flow( visitee = to_visit_stack.pop() if isinstance(visitee, str): # 'drop', 'passthrough' can be passed as estimators known_sub_components.add(visitee) - elif visitee is None: # a None step can be included in a Pipeline - known_sub_components.add(str(visitee)) elif visitee.name in known_sub_components: raise ValueError('Found a second occurence of component %s when ' 'trying to serialize %s.' % (visitee.name, model)) @@ -883,7 +879,12 @@ def flatten_all(list_): raise ValueError(msg) else: pass - elif not isinstance(sub_component, (OpenMLFlow, type(None))): + elif isinstance(sub_component, type(None)): + msg = 'Cannot serialize objects of None type. Please replace with a '\ + 'relevant placeholder. Note that empty sklearn estimators can be '\ + 'replaced with \'drop\' or \'passthrough\'.' + raise ValueError(msg) + elif not isinstance(sub_component, OpenMLFlow): msg = 'Second item of tuple does not match assumptions. 
' \ 'Expected OpenMLFlow, got %s' % type(sub_component) raise TypeError(msg) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 1b59da9c0..ffba201f0 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -674,17 +674,17 @@ def test_serialize_feature_union(self): self.assertEqual(new_model_params, fu_params) new_model.fit(self.X, self.y) - fu.set_params(scaler=None) + fu.set_params(scaler='drop') serialization = self.extension.model_to_flow(fu) self.assertEqual(serialization.name, 'sklearn.pipeline.FeatureUnion(' 'ohe=sklearn.preprocessing.{}.OneHotEncoder,' - 'scaler=None)' + 'scaler=drop)' .format(module_name_encoder)) new_model = self.extension.flow_to_model(serialization) self.assertEqual(type(new_model), type(fu)) self.assertIsNot(new_model, fu) - self.assertIs(new_model.transformer_list[1][1], None) + self.assertIs(new_model.transformer_list[1][1], 'drop') def test_serialize_feature_union_switched_names(self): ohe_params = ({'categories': 'auto'} @@ -1804,7 +1804,6 @@ def test_run_on_model_with_empty_steps(self): clf = sklearn.pipeline.Pipeline([ ('dummystep', 'passthrough'), # adding 'passthrough' as an estimator ('prep', clf), - ('variancethreshold', None), # adding 'None' as an estimator ('classifier', sklearn.svm.SVC(gamma='auto')) ]) @@ -1817,10 +1816,9 @@ def test_run_on_model_with_empty_steps(self): # serializing model with non-actionable step run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True) - self.assertEqual(len(flow.components), 4) + self.assertEqual(len(flow.components), 3) self.assertEqual(flow.components['dummystep'], 'passthrough') self.assertTrue(isinstance(flow.components['classifier'], OpenMLFlow)) - self.assertEqual(flow.components['variancethreshold'], None) self.assertTrue(isinstance(flow.components['prep'], OpenMLFlow)) self.assertTrue(isinstance(flow.components['prep'].components['columntransformer'], OpenMLFlow)) @@ -1832,5 +1830,5 @@ def test_run_on_model_with_empty_steps(self): model.fit(X, y) self.assertEqual(type(model), type(clf)) self.assertNotEqual(model, clf) - self.assertEqual(len(model.named_steps), 4) - self.assertEqual(model.named_steps['variancethreshold'], None) + self.assertEqual(len(model.named_steps), 3) + self.assertEqual(model.named_steps['dummystep'], 'passthrough') From 3b03d151815f70c71c0a6ebab8218b35a9f61d7e Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 18 Nov 2019 15:22:16 +0100 Subject: [PATCH 7/7] Adding test case for None estimator --- openml/extensions/sklearn/extension.py | 4 ++-- .../test_sklearn_extension/test_sklearn_extension.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 4cb706f02..f1a25cbbe 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -880,8 +880,8 @@ def flatten_all(list_): else: pass elif isinstance(sub_component, type(None)): - msg = 'Cannot serialize objects of None type. Please replace with a '\ - 'relevant placeholder. Note that empty sklearn estimators can be '\ + msg = 'Cannot serialize objects of None type. Please use a valid ' \ + 'placeholder for None. Note that empty sklearn estimators can be '\ 'replaced with \'drop\' or \'passthrough\'.' 
raise ValueError(msg) elif not isinstance(sub_component, OpenMLFlow): diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index ffba201f0..3ab9d8936 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -608,6 +608,8 @@ def test_serialize_column_transformer_pipeline(self): serialization2 = self.extension.model_to_flow(new_model) assert_flows_equal(serialization, serialization2) + @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20", + reason="Pipeline processing behaviour updated") def test_serialize_feature_union(self): ohe_params = {'sparse': False} if LooseVersion(sklearn.__version__) >= "0.20": @@ -1832,3 +1834,12 @@ def test_run_on_model_with_empty_steps(self): self.assertNotEqual(model, clf) self.assertEqual(len(model.named_steps), 3) self.assertEqual(model.named_steps['dummystep'], 'passthrough') + + def test_sklearn_serialization_with_none_step(self): + msg = 'Cannot serialize objects of None type. Please use a valid ' \ + 'placeholder for None. Note that empty sklearn estimators can be ' \ + 'replaced with \'drop\' or \'passthrough\'.' + clf = sklearn.pipeline.Pipeline([('dummystep', None), + ('classifier', sklearn.svm.SVC(gamma='auto'))]) + with self.assertRaisesRegex(ValueError, msg): + self.extension.model_to_flow(clf)
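
Usage sketch (not part of the patches above): the snippet below illustrates, assuming scikit-learn >= 0.21 and the openml-python branch these patches apply to, how a pipeline containing the placeholder steps 'passthrough' and 'drop' serializes to an OpenMLFlow and round-trips back, and how a literal None step is now rejected. The estimator layout, step names, and column indices are illustrative only; the API calls (SklearnExtension.model_to_flow / flow_to_model) are the ones exercised by the tests in the patches.

    # Illustrative sketch; mirrors test_run_on_model_with_empty_steps and
    # test_sklearn_serialization_with_none_step from the patches above.
    import sklearn.pipeline
    import sklearn.svm
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler

    from openml.extensions.sklearn import SklearnExtension

    extension = SklearnExtension()

    # 'drop' and 'passthrough' are plain strings, not estimator objects;
    # after these patches they are preserved as string sub-components of the flow.
    prep = ColumnTransformer([
        ('scale', StandardScaler(), [0, 1]),  # column indices chosen only for illustration
        ('unused', 'drop', [2]),
    ])
    clf = sklearn.pipeline.Pipeline([
        ('dummystep', 'passthrough'),
        ('prep', prep),
        ('classifier', sklearn.svm.SVC(gamma='auto')),
    ])

    flow = extension.model_to_flow(clf)  # local serialization, no server call needed
    assert flow.components['dummystep'] == 'passthrough'
    # Expected component layout by analogy with the tests above:
    # the ColumnTransformer keeps 'drop' as the string component for 'unused'.
    assert flow.components['prep'].components['unused'] == 'drop'

    model = extension.flow_to_model(flow)  # deserialization restores the string steps
    assert model.named_steps['dummystep'] == 'passthrough'

    # A literal None step is no longer serializable (patches 6/7 and 7/7); the
    # extension asks for 'drop' or 'passthrough' as an explicit placeholder instead.
    bad = sklearn.pipeline.Pipeline([
        ('dummystep', None),
        ('classifier', sklearn.svm.SVC(gamma='auto')),
    ])
    try:
        extension.model_to_flow(bad)
    except ValueError as e:
        print(e)  # "Cannot serialize objects of None type. ..."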