def find_invalid_characters(string, pattern):
    """Return the characters of ``string`` rejected by ``pattern``, formatted for an error message.

    Every distinct character of ``string`` is tested on its own against
    ``pattern``; the characters that do not match are collected.

    Parameters
    ----------
    string : str
        Text to inspect character by character.
    pattern : str
        Regular expression that each single character must fully match to be
        considered valid.

    Returns
    -------
    str
        Comma-separated list of the distinct rejected characters, in order of
        first appearance. Each character is wrapped in single quotes, except
        the single quote itself, which is wrapped in double quotes. Empty
        string when no character is rejected.
    """
    regex = re.compile(pattern)
    seen = set()
    invalid_chars = []
    for char in string:
        if char in seen:
            continue
        seen.add(char)
        # fullmatch (not match): with an anchored pattern such as "^[a-z]*$",
        # match() accepts "\n" through an empty match followed by "$", which
        # would hide a newline that makes the whole-string check fail.
        if not regex.fullmatch(char):
            # A list keeps first-occurrence order, so the resulting message is
            # identical across runs (set iteration order is not).
            invalid_chars.append(char)
    return ",".join(
        "'{}'".format(char) if char != "'" else '"{}"'.format(char)
        for char in invalid_chars
    )
def test_init_string_validation(self):
    """Check that invalid name, description and citation strings are rejected.

    Each case constructs an ``OpenMLDataset`` without a ``dataset_id`` and
    expects a ``ValueError`` whose message matches the given text, which
    names the offending character.
    """
    cases = [
        (
            "Invalid symbols ' ' in name",
            {"name": "some name", "description": "a description"},
        ),
        (
            "Invalid symbols 'ï' in description",
            {"name": "somename", "description": "a descriptïon"},
        ),
        (
            "Invalid symbols 'ü' in citation",
            {
                "name": "somename",
                "description": "a description",
                "citation": "Something by Müller",
            },
        ),
    ]
    for expected_message, constructor_kwargs in cases:
        with pytest.raises(ValueError, match=expected_message):
            openml.datasets.OpenMLDataset(**constructor_kwargs)