diff --git a/doc/progress.rst b/doc/progress.rst index 493b029e5..3fc493914 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -6,6 +6,11 @@ Changelog ========= +next +~~~~~~ + + * MAINT #1280: Use the server-provided ``parquet_url`` instead of ``minio_url`` to determine the location of the parquet file. + 0.14.1 ~~~~~~ diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index dcdef162d..c547a7cb6 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -96,10 +96,12 @@ class OpenMLDataset(OpenMLBase): which maps a quality name to a quality value. dataset: string, optional Serialized arff dataset string. - minio_url: string, optional - URL to the MinIO bucket with dataset files + parquet_url: string, optional + This is the URL to the storage location where the dataset files are hosted. + This can be a MinIO bucket URL. If specified, the data will be accessed + from this URL when reading the files. parquet_file: string, optional - Path to the local parquet file. + Path to the local copy of the parquet file. 
""" def __init__( @@ -132,7 +134,7 @@ def __init__( features_file: Optional[str] = None, qualities_file: Optional[str] = None, dataset=None, - minio_url: Optional[str] = None, + parquet_url: Optional[str] = None, parquet_file: Optional[str] = None, ): def find_invalid_characters(string, pattern): @@ -210,7 +212,7 @@ def find_invalid_characters(string, pattern): self.data_file = data_file self.parquet_file = parquet_file self._dataset = dataset - self._minio_url = minio_url + self._parquet_url = parquet_url self._features = None # type: Optional[Dict[int, OpenMLDataFeature]] self._qualities = None # type: Optional[Dict[str, float]] @@ -329,7 +331,7 @@ def _download_data(self) -> None: from .functions import _get_dataset_arff, _get_dataset_parquet self.data_file = _get_dataset_arff(self) - if self._minio_url is not None: + if self._parquet_url is not None: self.parquet_file = _get_dataset_parquet(self) def _get_arff(self, format: str) -> Dict: diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 9db702131..8d9047e6e 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -495,7 +495,7 @@ def get_dataset( qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) arff_file = _get_dataset_arff(description) if download_data else None - if "oml:minio_url" in description and download_data: + if "oml:parquet_url" in description and download_data: try: parquet_file = _get_dataset_parquet( description, download_all_files=download_all_files @@ -1062,7 +1062,7 @@ def _get_dataset_parquet( download_all_files: bool, optional (default=False) If `True`, download all data found in the bucket to which the description's - ``minio_url`` points, only download the parquet file otherwise. + ``parquet_url`` points, only download the parquet file otherwise. Returns ------- @@ -1070,10 +1070,10 @@ def _get_dataset_parquet( Location of the Parquet file if successfully downloaded, None otherwise. 
""" if isinstance(description, dict): - url = cast(str, description.get("oml:minio_url")) + url = cast(str, description.get("oml:parquet_url")) did = description.get("oml:id") elif isinstance(description, OpenMLDataset): - url = cast(str, description._minio_url) + url = cast(str, description._parquet_url) did = description.dataset_id else: raise TypeError("`description` should be either OpenMLDataset or Dict.") @@ -1316,7 +1316,7 @@ def _create_dataset_from_description( cache_format=cache_format, features_file=features_file, qualities_file=qualities_file, - minio_url=description.get("oml:minio_url"), + parquet_url=description.get("oml:parquet_url"), parquet_file=parquet_file, ) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index fe04f7d96..11c3bdcf6 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -439,7 +439,7 @@ def test__download_minio_file_works_with_bucket_subdirectory(self): def test__get_dataset_parquet_not_cached(self): description = { - "oml:minio_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq", + "oml:parquet_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq", "oml:id": "20", } path = _get_dataset_parquet(description, cache_directory=self.workdir) @@ -450,10 +450,10 @@ def test__get_dataset_parquet_is_cached(self, patch): openml.config.set_root_cache_directory(self.static_cache_dir) patch.side_effect = RuntimeError( - "_download_minio_file should not be called when loading from cache" + "_download_minio_file should not be called when loading from cache" ) description = { - "oml:minio_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq", + "oml:parquet_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq", "oml:id": "30", } path = _get_dataset_parquet(description, cache_directory=None) @@ -462,7 +462,7 @@ 
def test__get_dataset_parquet_file_does_not_exist(self): description = { - "oml:minio_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq", + "oml:parquet_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq", "oml:id": "20", } path = _get_dataset_parquet(description, cache_directory=self.workdir) @@ -1416,7 +1416,7 @@ def test_get_dataset_cache_format_feather(self): # The parquet file on minio with ID 128 is not the iris dataset from the test server. dataset = openml.datasets.get_dataset(128, cache_format="feather") # Workaround - dataset._minio_url = None + dataset._parquet_url = None dataset.parquet_file = None dataset.get_data() @@ -1561,7 +1561,7 @@ def test_get_dataset_parquet(self): # There is no parquet-copy of the test server yet. openml.config.server = self.production_server dataset = openml.datasets.get_dataset(61) - self.assertIsNotNone(dataset._minio_url) + self.assertIsNotNone(dataset._parquet_url) self.assertIsNotNone(dataset.parquet_file) self.assertTrue(os.path.isfile(dataset.parquet_file))