Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 30 additions & 6 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,16 +133,40 @@ def __init__(
qualities=None,
dataset=None,
):
def find_invalid_characters(string, pattern):
    """Return the characters of ``string`` that ``pattern`` rejects, formatted for an error message.

    Each character of ``string`` is matched on its own against ``pattern``
    (via ``re.match``); every character that does not match is reported once.

    Parameters
    ----------
    string : str
        The text to inspect.
    pattern : str
        Regular expression that a single valid character must match.

    Returns
    -------
    str
        The distinct invalid characters, each wrapped in single quotes (a
        literal ``'`` is wrapped in double quotes instead), joined by commas
        in order of first appearance. Empty string if nothing is invalid.
    """
    regex = re.compile(pattern)
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # resulting message is identical on every run. A plain set would be joined
    # in an order that changes with string hash randomisation.
    invalid_chars = dict.fromkeys(char for char in string if not regex.match(char))
    return ",".join(
        # Quote an apostrophe with double quotes so it stays readable.
        '"{}"'.format(char) if char == "'" else "'{}'".format(char)
        for char in invalid_chars
    )

if dataset_id is None:
if description and not re.match("^[\x00-\x7F]*$", description):
pattern = "^[\x00-\x7F]*$"
if description and not re.match(pattern, description):
# not basiclatin (XSD complains)
raise ValueError("Invalid symbols in description: {}".format(description))
if citation and not re.match("^[\x00-\x7F]*$", citation):
invalid_characters = find_invalid_characters(description, pattern)
raise ValueError(
"Invalid symbols {} in description: {}".format(invalid_characters, description)
)
pattern = "^[\x00-\x7F]*$"
if citation and not re.match(pattern, citation):
# not basiclatin (XSD complains)
raise ValueError("Invalid symbols in citation: {}".format(citation))
if not re.match("^[a-zA-Z0-9_\\-\\.\\(\\),]+$", name):
invalid_characters = find_invalid_characters(citation, pattern)
raise ValueError(
"Invalid symbols {} in citation: {}".format(invalid_characters, citation)
)
pattern = "^[a-zA-Z0-9_\\-\\.\\(\\),]+$"
if not re.match(pattern, name):
# regex given by server in error message
raise ValueError("Invalid symbols in name: {}".format(name))
invalid_characters = find_invalid_characters(name, pattern)
raise ValueError("Invalid symbols {} in name: {}".format(invalid_characters, name))
# TODO add function to check if the name is casual_string128
# Attributes received by querying the RESTful API
self.dataset_id = int(dataset_id) if dataset_id is not None else None
Expand Down
6 changes: 3 additions & 3 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ def test_repr(self):
str(data)

def test_init_string_validation(self):
with pytest.raises(ValueError, match="Invalid symbols in name"):
with pytest.raises(ValueError, match="Invalid symbols ' ' in name"):
openml.datasets.OpenMLDataset(name="some name", description="a description")

with pytest.raises(ValueError, match="Invalid symbols in description"):
with pytest.raises(ValueError, match="Invalid symbols 'ï' in description"):
openml.datasets.OpenMLDataset(name="somename", description="a descriptïon")

with pytest.raises(ValueError, match="Invalid symbols in citation"):
with pytest.raises(ValueError, match="Invalid symbols 'ü' in citation"):
openml.datasets.OpenMLDataset(
name="somename", description="a description", citation="Something by Müller"
)
Expand Down