Skip to content

Commit a629562

Browse files
authored
Improve unit tests (#985)
* randomize test order * reduce noise in the output to better see the issues * deprecate format argument to OpenMLDataset * fix file upload * further reduce warnings * fix test which failed due to deleting a dataset on the test server * re-add test randomization (due to rebase) * try if random test order causes all problems by removing it * improve lbfgs test * distribute tests better * reduce randomness in lbfgs test * add requested commits
1 parent 51eaff6 commit a629562

File tree

11 files changed

+63
-53
lines changed

11 files changed

+63
-53
lines changed

.github/workflows/ubuntu-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ jobs:
5151
- name: Run tests
5252
run: |
5353
if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi
54-
pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread -sv $codecov
54+
pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread --dist load -sv $codecov
5555
- name: Check for files left behind by test
5656
if: ${{ always() }}
5757
run: |

appveyor.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,4 @@ build: false
4545

4646
test_script:
4747
- "cd C:\\projects\\openml-python"
48-
- "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread -sv"
48+
- "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread --dist load -sv"

openml/datasets/dataset.py

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
import numpy as np
1414
import pandas as pd
1515
import scipy.sparse
16-
from warnings import warn
1716

1817
from openml.base import OpenMLBase
1918
from .data_feature import OpenMLDataFeature
@@ -34,7 +33,7 @@ class OpenMLDataset(OpenMLBase):
3433
Name of the dataset.
3534
description : str
3635
Description of the dataset.
37-
format : str
36+
data_format : str
3837
Format of the dataset which can be either 'arff' or 'sparse_arff'.
3938
cache_format : str
4039
Format for caching the dataset which can be either 'feather' or 'pickle'.
@@ -103,7 +102,6 @@ def __init__(
103102
self,
104103
name,
105104
description,
106-
format=None,
107105
data_format="arff",
108106
cache_format="pickle",
109107
dataset_id=None,
@@ -178,16 +176,8 @@ def find_invalid_characters(string, pattern):
178176
)
179177

180178
self.cache_format = cache_format
181-
if format is None:
182-
self.format = data_format
183-
else:
184-
warn(
185-
"The format parameter in the init will be deprecated "
186-
"in the future."
187-
"Please use data_format instead",
188-
DeprecationWarning,
189-
)
190-
self.format = format
179+
# Has to be called format, otherwise there will be an XML upload error
180+
self.format = data_format
191181
self.creator = creator
192182
self.contributor = contributor
193183
self.collection_date = collection_date
@@ -456,12 +446,11 @@ def _parse_data_from_arff(
456446
col.append(
457447
self._unpack_categories(X[column_name], categories_names[column_name])
458448
)
459-
elif attribute_dtype[column_name] in ('floating',
460-
'integer'):
449+
elif attribute_dtype[column_name] in ("floating", "integer"):
461450
X_col = X[column_name]
462451
if X_col.min() >= 0 and X_col.max() <= 255:
463452
try:
464-
X_col_uint = X_col.astype('uint8')
453+
X_col_uint = X_col.astype("uint8")
465454
if (X_col == X_col_uint).all():
466455
col.append(X_col_uint)
467456
continue

openml/extensions/sklearn/extension.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1748,7 +1748,7 @@ def _prediction_to_probabilities(
17481748
proba_y.shape[1], len(task.class_labels),
17491749
)
17501750
warnings.warn(message)
1751-
openml.config.logger.warn(message)
1751+
openml.config.logger.warning(message)
17521752

17531753
for i, col in enumerate(task.class_labels):
17541754
# adding missing columns with 0 probability

openml/flows/flow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
229229

230230
if not self.description:
231231
logger = logging.getLogger(__name__)
232-
logger.warn("Flow % has empty description", self.name)
232+
logger.warning("Flow % has empty description", self.name)
233233

234234
flow_parameters = []
235235
for key in self.parameters:

openml/study/functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def get_study(
5858
"of things have changed since then. Please use `get_suite('OpenML100')` instead."
5959
)
6060
warnings.warn(message, DeprecationWarning)
61-
openml.config.logger.warn(message)
61+
openml.config.logger.warning(message)
6262
study = _get_study(study_id, entity_type="task")
6363
return cast(OpenMLBenchmarkSuite, study) # type: ignore
6464
else:

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def delete_remote_files(tracker) -> None:
126126
openml.utils._delete_entity(entity_type, entity)
127127
logger.info("Deleted ({}, {})".format(entity_type, entity))
128128
except Exception as e:
129-
logger.warn("Cannot delete ({},{}): {}".format(entity_type, entity, e))
129+
logger.warning("Cannot delete ({},{}): {}".format(entity_type, entity, e))
130130

131131

132132
def pytest_sessionstart() -> None:

tests/test_datasets/test_dataset.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# License: BSD 3-Clause
22

33
from time import time
4-
from warnings import filterwarnings, catch_warnings
54

65
import numpy as np
76
import pandas as pd
@@ -120,11 +119,11 @@ def test_get_data_no_str_data_for_nparrays(self):
120119

121120
def _check_expected_type(self, dtype, is_cat, col):
122121
if is_cat:
123-
expected_type = 'category'
124-
elif not col.isna().any() and (col.astype('uint8') == col).all():
125-
expected_type = 'uint8'
122+
expected_type = "category"
123+
elif not col.isna().any() and (col.astype("uint8") == col).all():
124+
expected_type = "uint8"
126125
else:
127-
expected_type = 'float64'
126+
expected_type = "float64"
128127

129128
self.assertEqual(dtype.name, expected_type)
130129

@@ -192,14 +191,6 @@ def test_get_data_with_ignore_attributes(self):
192191
self.assertEqual(rval.shape, (898, 38))
193192
self.assertEqual(len(categorical), 38)
194193

195-
def test_dataset_format_constructor(self):
196-
197-
with catch_warnings():
198-
filterwarnings("error")
199-
self.assertRaises(
200-
DeprecationWarning, openml.OpenMLDataset, "Test", "Test", format="arff"
201-
)
202-
203194
def test_get_data_with_nonexisting_class(self):
204195
# This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However,
205196
# label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to

tests/test_datasets/test_dataset_functions.py

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import random
55
from itertools import product
66
from unittest import mock
7+
import shutil
78

89
import arff
910
import time
@@ -373,9 +374,9 @@ def test_get_dataset_by_name(self):
373374
def test_get_dataset_uint8_dtype(self):
374375
dataset = openml.datasets.get_dataset(1)
375376
self.assertEqual(type(dataset), OpenMLDataset)
376-
self.assertEqual(dataset.name, 'anneal')
377+
self.assertEqual(dataset.name, "anneal")
377378
df, _, _, _ = dataset.get_data()
378-
self.assertEqual(df['carbon'].dtype, 'uint8')
379+
self.assertEqual(df["carbon"].dtype, "uint8")
379380

380381
def test_get_dataset(self):
381382
# This is the only non-lazy load to ensure default behaviour works.
@@ -1154,27 +1155,31 @@ def test_publish_fetch_ignore_attribute(self):
11541155
# test if publish was successful
11551156
self.assertIsInstance(dataset.id, int)
11561157

1158+
downloaded_dataset = self._wait_for_dataset_being_processed(dataset.id)
1159+
self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute)
1160+
1161+
def _wait_for_dataset_being_processed(self, dataset_id):
11571162
downloaded_dataset = None
11581163
# fetching from server
11591164
# loop till timeout or fetch not successful
1160-
max_waiting_time_seconds = 400
1165+
max_waiting_time_seconds = 600
11611166
# time.time() works in seconds
11621167
start_time = time.time()
11631168
while time.time() - start_time < max_waiting_time_seconds:
11641169
try:
1165-
downloaded_dataset = openml.datasets.get_dataset(dataset.id)
1170+
downloaded_dataset = openml.datasets.get_dataset(dataset_id)
11661171
break
11671172
except Exception as e:
11681173
# returned code 273: Dataset not processed yet
11691174
# returned code 362: No qualities found
11701175
TestBase.logger.error(
1171-
"Failed to fetch dataset:{} with '{}'.".format(dataset.id, str(e))
1176+
"Failed to fetch dataset:{} with '{}'.".format(dataset_id, str(e))
11721177
)
11731178
time.sleep(10)
11741179
continue
11751180
if downloaded_dataset is None:
1176-
raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset.id))
1177-
self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute)
1181+
raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset_id))
1182+
return downloaded_dataset
11781183

11791184
def test_create_dataset_row_id_attribute_error(self):
11801185
# meta-information
@@ -1347,7 +1352,7 @@ def test_get_dataset_cache_format_feather(self):
13471352
self.assertEqual(len(categorical), X.shape[1])
13481353
self.assertEqual(len(attribute_names), X.shape[1])
13491354

1350-
def test_data_edit(self):
1355+
def test_data_edit_non_critical_field(self):
13511356
# Case 1
13521357
# All users can edit non-critical fields of datasets
13531358
desc = (
@@ -1368,14 +1373,31 @@ def test_data_edit(self):
13681373
edited_dataset = openml.datasets.get_dataset(did)
13691374
self.assertEqual(edited_dataset.description, desc)
13701375

1376+
def test_data_edit_critical_field(self):
13711377
# Case 2
13721378
# only owners (or admin) can edit all critical fields of datasets
1373-
# this is a dataset created by CI, so it is editable by this test
1374-
did = 315
1375-
result = edit_dataset(did, default_target_attribute="col_1", ignore_attribute="col_2")
1379+
# for this, we need to first clone a dataset to do changes
1380+
did = fork_dataset(1)
1381+
self._wait_for_dataset_being_processed(did)
1382+
result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil")
13761383
self.assertEqual(did, result)
1377-
edited_dataset = openml.datasets.get_dataset(did)
1378-
self.assertEqual(edited_dataset.ignore_attribute, ["col_2"])
1384+
1385+
n_tries = 10
1386+
# we need to wait for the edit to be reflected on the server
1387+
for i in range(n_tries):
1388+
edited_dataset = openml.datasets.get_dataset(did)
1389+
try:
1390+
self.assertEqual(edited_dataset.default_target_attribute, "shape", edited_dataset)
1391+
self.assertEqual(edited_dataset.ignore_attribute, ["oil"], edited_dataset)
1392+
break
1393+
except AssertionError as e:
1394+
if i == n_tries - 1:
1395+
raise e
1396+
time.sleep(10)
1397+
# Delete the cache dir to get the newer version of the dataset
1398+
shutil.rmtree(
1399+
os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did))
1400+
)
13791401

13801402
def test_data_edit_errors(self):
13811403
# Check server exception when no field to edit is provided

tests/test_runs/test_run_functions.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,7 @@ def determine_grid_size(param_grid):
442442
# suboptimal (slow), and not guaranteed to work if evaluation
443443
# engine is behind.
444444
# TODO: mock this? We have the arff already on the server
445-
self._wait_for_processed_run(run.run_id, 400)
445+
self._wait_for_processed_run(run.run_id, 600)
446446
try:
447447
model_prime = openml.runs.initialize_model_from_trace(
448448
run_id=run.run_id, repeat=0, fold=0,
@@ -519,7 +519,7 @@ def _run_and_upload_regression(
519519
)
520520

521521
def test_run_and_upload_logistic_regression(self):
522-
lr = LogisticRegression(solver="lbfgs")
522+
lr = LogisticRegression(solver="lbfgs", max_iter=1000)
523523
task_id = self.TEST_SERVER_TASK_SIMPLE[0]
524524
n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
525525
n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
@@ -605,7 +605,8 @@ def get_ct_cf(nominal_indices, numeric_indices):
605605
LooseVersion(sklearn.__version__) < "0.20",
606606
reason="columntransformer introduction in 0.20.0",
607607
)
608-
def test_run_and_upload_knn_pipeline(self):
608+
@unittest.mock.patch("warnings.warn")
609+
def test_run_and_upload_knn_pipeline(self, warnings_mock):
609610

610611
cat_imp = make_pipeline(
611612
SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
@@ -635,11 +636,18 @@ def test_run_and_upload_knn_pipeline(self):
635636
n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS[1]
636637
n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS[2]
637638
self._run_and_upload_classification(pipeline2, task_id, n_missing_vals, n_test_obs, "62501")
639+
# The warning raised is:
640+
# The total space of parameters 8 is smaller than n_iter=10.
641+
# Running 8 iterations. For exhaustive searches, use GridSearchCV.'
642+
# It is raised three times because we once run the model to upload something and then run
643+
# it again twice to compare that the predictions are reproducible.
644+
self.assertEqual(warnings_mock.call_count, 3)
638645

639646
def test_run_and_upload_gridsearch(self):
640647
gridsearch = GridSearchCV(
641648
BaggingClassifier(base_estimator=SVC()),
642649
{"base_estimator__C": [0.01, 0.1, 10], "base_estimator__gamma": [0.01, 0.1, 10]},
650+
cv=3,
643651
)
644652
task_id = self.TEST_SERVER_TASK_SIMPLE[0]
645653
n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]

0 commit comments

Comments (0)