Parquet Support #1029
Changes from all commits: f9f50ac, 049196c, fe3dbe2, 1a5ed88, 3905d66, ca27f18, b4ed955, 7cecfb9, a0ab074, 36ede4d, f02eb0f, 7ccbe5e, 5099c2b, 8d86c2b, 117c671, 56137a1, d64257d, 2eeb8bf
```diff
@@ -3,7 +3,7 @@
 import io
 import logging
 import os
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, cast

 import numpy as np
 import arff
```
```diff
@@ -424,6 +424,10 @@ def get_dataset(
             raise

         arff_file = _get_dataset_arff(description) if download_data else None
+        if "oml:minio_url" in description and download_data:
+            parquet_file = _get_dataset_parquet(description)
+        else:
+            parquet_file = None
         remove_dataset_cache = False
     except OpenMLServerException as e:
         # if there was an exception,
```

Review discussion on the new oml:minio_url branch:

Collaborator: Could you please explain why we potentially download both files? I guess this is easier to handle at the moment?

Author: The main reason is that the test server currently always returns a …

Collaborator: That makes sense. Do you think we should open an issue to track the move from ARFF to Parquet?

Author: Seems like a good idea: #1032.
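For context, a minimal sketch of the resulting user-facing flow (the dataset id is illustrative; `get_dataset` and `get_data` are the existing public APIs):

```python
import openml

# If the dataset description carries "oml:minio_url", the Parquet file
# is fetched in addition to the ARFF file (see the discussion above).
dataset = openml.datasets.get_dataset(61)  # illustrative dataset id

# Data access is unchanged from the caller's point of view.
X, y, _, _ = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format="dataframe"
)
print(X.head())
```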
|
|
```diff
@@ -437,7 +441,7 @@
         _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)

     dataset = _create_dataset_from_description(
-        description, features_file, qualities_file, arff_file, cache_format
+        description, features_file, qualities_file, arff_file, parquet_file, cache_format
     )
     return dataset
```
|
|
|
|
```diff
@@ -908,6 +912,55 @@ def _get_dataset_description(did_cache_dir, dataset_id):
     return description


+def _get_dataset_parquet(
+    description: Union[Dict, OpenMLDataset], cache_directory: str = None
+) -> Optional[str]:
+    """ Return the path to the local parquet file of the dataset. If it is not cached, it is downloaded.
+
+    Checks if the file is in the cache; if yes, returns the path to the file.
+    If not, downloads the file and caches it, then returns the file path.
+    The cache directory is generated from the dataset information, but can also be specified.
+
+    This function is NOT thread/multiprocessing safe.
+    Unlike the ARFF equivalent, checksums are not available/used (for now).
+
+    Parameters
+    ----------
+    description : dictionary or OpenMLDataset
+        Either a dataset description as dict or an OpenMLDataset.
+
+    cache_directory: str, optional (default=None)
+        Folder to store the parquet file in.
+        If None, use the default cache directory for the dataset.
+
+    Returns
+    -------
+    output_filename : string, optional
+        Location of the Parquet file if successfully downloaded, None otherwise.
+    """
+    if isinstance(description, dict):
+        url = description.get("oml:minio_url")
+        did = description.get("oml:id")
+    elif isinstance(description, OpenMLDataset):
+        url = description._minio_url
+        did = description.dataset_id
+    else:
+        raise TypeError("`description` should be either OpenMLDataset or Dict.")
+
+    if cache_directory is None:
+        cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did)
+    output_file_path = os.path.join(cache_directory, "dataset.pq")
+
+    if not os.path.isfile(output_file_path):
+        try:
+            openml._api_calls._download_minio_file(
+                source=cast(str, url), destination=output_file_path
+            )
+        except FileNotFoundError:
+            return None
+    return output_file_path
+
+
 def _get_dataset_arff(description: Union[Dict, OpenMLDataset], cache_directory: str = None) -> str:
     """ Return the path to the local arff file of the dataset. If it is not cached, it is downloaded.
```
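A short sketch of how this helper behaves (the description dict below is hypothetical; its keys mirror the ones the function reads, and the MinIO URL is purely illustrative):

```python
# Hypothetical description dict in the OpenML XML-to-dict shape.
description = {
    "oml:id": 61,  # illustrative dataset id
    "oml:minio_url": "https://minio.example.org/datasets/61/dataset.pq",  # illustrative
}

path = _get_dataset_parquet(description)
if path is None:
    # The file was not found on MinIO; callers fall back to the ARFF file.
    print("No Parquet file available for this dataset.")
else:
    print("Cached Parquet file at:", path)  # ends in .../dataset.pq
```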
|
|
```diff
@@ -1031,6 +1084,7 @@ def _create_dataset_from_description(
     features_file: str,
     qualities_file: str,
     arff_file: str = None,
+    parquet_file: str = None,
     cache_format: str = "pickle",
 ) -> OpenMLDataset:
     """Create a dataset object from a description dict.
```
|
|
```diff
@@ -1045,6 +1099,8 @@ def _create_dataset_from_description(
         Path of the dataset qualities as xml file.
     arff_file : string, optional
         Path of dataset ARFF file.
+    parquet_file : string, optional
+        Path of dataset Parquet file.
     cache_format: string, optional
         Caching option for datasets (feather/pickle)
```
|
|
```diff
@@ -1081,6 +1137,8 @@ def _create_dataset_from_description(
         cache_format=cache_format,
         features_file=features_file,
         qualities_file=qualities_file,
+        minio_url=description.get("oml:minio_url"),
+        parquet_file=parquet_file,
    )
```
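With `minio_url` and `parquet_file` now attached to the `OpenMLDataset`, the cached file can be read directly with pandas. A minimal sketch (the cache path is illustrative; pandas needs the `pyarrow` or `fastparquet` engine for Parquet):

```python
import os
import pandas as pd

# Illustrative cache location; in practice the path comes from _get_dataset_parquet.
pq_path = os.path.expanduser("~/.openml/datasets/61/dataset.pq")
df = pd.read_parquet(pq_path)  # requires pyarrow or fastparquet
print(df.shape)
```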
|
|