Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
4a7aba3
add reading option for fhir
crisely09 May 28, 2025
37482f8
little reformatting
crisely09 May 28, 2025
9960ddb
add fhir dataset example
crisely09 May 28, 2025
00ccdb3
small addition to metadata
crisely09 May 28, 2025
6d6d366
added output for serviceRequest loading record-set
crisely09 May 28, 2025
d08b825
simplify a bit the metadata file
crisely09 May 28, 2025
3154634
Read JSON files faster
crisely09 May 28, 2025
f0a4093
bring back previous definition of the parse_json_content
crisely09 May 28, 2025
8eae6ec
few format fixes
crisely09 May 28, 2025
b97027b
align dataset metadata example
crisely09 May 28, 2025
79581dd
fall back to jsonpath_rw when there is recursive-descent
crisely09 May 28, 2025
c0c96c8
fix flake8
crisely09 May 28, 2025
270fe5a
Black format fixes, add tests for classes, other suggested changes
crisely09 May 30, 2025
a553715
updated output from dataset
crisely09 May 30, 2025
9aff86a
fix isort
crisely09 May 30, 2025
3fbfac9
fix test expectations
crisely09 May 30, 2025
8fe4af4
fix format
crisely09 May 30, 2025
9df8a42
fix flakes
crisely09 May 30, 2025
1aed383
fix expectation of tests
crisely09 May 30, 2025
515f346
if not replaced to if is None
crisely09 Jun 6, 2025
f477e8f
read bounding boxes all at once
crisely09 Jul 15, 2025
3fdf02d
lazy load orjson
crisely09 Jul 15, 2025
2344db9
remove imports of orjson
crisely09 Jul 15, 2025
ace9c0c
fix python format black
crisely09 Jul 23, 2025
0d61acd
run black again
crisely09 Jul 23, 2025
cd83e47
update bounding_box parsing to pass the test
crisely09 Jul 23, 2025
827fcf4
trying to include all cases for bounding boxes
crisely09 Jul 23, 2025
c2ba57f
fix format and pytype
crisely09 Jul 23, 2025
52a2532
trying to fix format errors
crisely09 Jul 23, 2025
d642034
reverted box changes, added fhir resources library and lazy load JSON…
crisely09 Feb 16, 2026
bd91ce6
cleaning
crisely09 Feb 16, 2026
bb1d109
revert notebook unintentional changes
crisely09 Feb 16, 2026
e75993e
fix metadata example
crisely09 Feb 16, 2026
fb7a7a3
More fixes
crisely09 Feb 16, 2026
acf8e14
fix mypy
crisely09 Feb 16, 2026
e804f29
Remove bounding box changes (moved to separate PR)
crisely09 Feb 16, 2026
a9d98a3
restore notebooks
crisely09 Feb 23, 2026
a762804
missing docs
crisely09 Feb 23, 2026
b20e9b6
really restore notebooks
crisely09 Feb 23, 2026
60acc8f
notebook restore
crisely09 Feb 23, 2026
b097456
make sure implementation doesn't break previous behaviour
crisely09 Feb 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ jobs:
apache-beam \
etils[epath] \
GitPython \
jmespath \
jsonpath_rw \
librosa \
mypy \
Expand Down Expand Up @@ -169,6 +170,7 @@ jobs:
apache-beam \
etils[epath] \
GitPython \
jmespath \
jsonpath_rw \
mypy \
networkx \
Expand Down
342 changes: 342 additions & 0 deletions datasets/1.1/pharmaccess-momcare-fhir/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,342 @@
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"arrayShape": "cr:arrayShape",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"containedIn": "cr:containedIn",
"cr": "http://mlcommons.org/croissant/",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
},
"dct": "http://purl.org/dc/terms/",
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isArray": "cr:isArray",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"prov": "http://www.w3.org/ns/prov#",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"repeated": "cr:repeated",
"replace": "cr:replace",
"sc": "https://schema.org/",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform"
},
"@type": "sc:Dataset",
"conformsTo": "http://mlcommons.org/croissant/1.1",
"name": "MomCare_dataset_FHIR_patient_serviceRequest",
"description": "This dataset is part of the MomCare dataset, which captures maternal health journeys from the MomCare program in Tanzania, structured as HL7 FHIR v4 resources. It includes over 430,000 records across nine resource types—Patient, Observation, Condition, EpisodeOfCare, Location, Organization, Questionnaire, QuestionnaireResponse, and ServiceRequest. Extracted from a relational point-of-service system and transformed using SQL-based methods, the dataset models enrollment, antenatal care, diagnoses, risk profiling, and follow-up across more than 70 clinics. All data are pseudonymized, machine-actionable, and exported in NDJSON format for interoperability, analytics, and AI integration.",
"recordSet": [
{
"@id": "serviceRequest",
"@type": "cr:RecordSet",
"name": "Servicerequest",
"description": "Automatically extracted fields from serviceRequest.ndjson",
"field": [
{
"@id": "serviceRequest/authoredOn",
"@type": "cr:Field",
"name": "authoredOn",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.authoredOn"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/identifier_value",
"@type": "cr:Field",
"name": "value",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.identifier[*].value"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/intent",
"@type": "cr:Field",
"name": "intent",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.intent"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/performer_display",
"@type": "cr:Field",
"name": "display",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.performer[*].display"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/performer_reference",
"@type": "cr:Field",
"name": "reference",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.performer[*].reference"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/performer_type",
"@type": "cr:Field",
"name": "type",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.performer[*].type"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/reasonCode_text",
"@type": "cr:Field",
"name": "text",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.reasonCode[*].text"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/requester_display",
"@type": "cr:Field",
"name": "display",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.requester.display"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/requester_reference",
"@type": "cr:Field",
"name": "reference",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.requester.reference"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/requester_type",
"@type": "cr:Field",
"name": "type",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.requester.type"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/resourceType",
"@type": "cr:Field",
"name": "resourceType",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.resourceType"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/status",
"@type": "cr:Field",
"name": "status",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.status"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/subject_reference",
"@type": "cr:Field",
"name": "reference",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.subject.reference"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
},
{
"@id": "serviceRequest/subject_type",
"@type": "cr:Field",
"name": "type",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.subject.type"
},
"fileObject": {
"@id": "resources/serviceRequest"
}
}
}
]
},
{
"@id": "patient",
"@type": "cr:RecordSet",
"name": "Patient",
"description": "Automatically extracted fields from patient.ndjson",
"field": [
{
"@id": "patient/active",
"@type": "cr:Field",
"name": "active",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.active"
},
"fileObject": {
"@id": "resources/patient"
}
}
},
{
"@id": "patient/birthDate",
"@type": "cr:Field",
"name": "birthDate",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.birthDate"
},
"fileObject": {
"@id": "resources/patient"
}
}
},
{
"@id": "patient/gender",
"@type": "cr:Field",
"name": "gender",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.gender"
},
"fileObject": {
"@id": "resources/patient"
}
}
},
{
"@id": "patient/resourceType",
"@type": "cr:Field",
"name": "resourceType",
"dataType": "sc:Text",
"source": {
"extract": {
"jsonPath": "$.resourceType"
},
"fileObject": {
"@id": "resources/patient"
}
}
}
]
}
],
"distribution": [
{
"@id": "resources/patient",
"@type": "cr:FileObject",
"contentSize": "13259393",
"contentUrl": "https://storage.googleapis.com/hanang-anonymized-maternal-care-data/patient.ndjson",
"description": "Raw FHIR patient resource exported as NDJSON.",
"encodingFormat": "application/fhir+json",
"name": "patient.ndjson",
"sha256": "e208bfb1e4b93750f48c6e406f94f9ead7e3e7d6b5b267d6077785fec5de0986"
},
{
"@id": "resources/serviceRequest",
"@type": "cr:FileObject",
"contentSize": "190572",
"contentUrl": "https://storage.googleapis.com/hanang-anonymized-maternal-care-data/serviceRequest.ndjson",
"description": "Raw FHIR serviceRequest resource exported as NDJSON.",
"encodingFormat": "application/fhir+json",
"name": "serviceRequest.ndjson",
"sha256": "da4aaffd4f57b9717a9984399bce85d764e8dc5f3b1ad5bea96acef4d4673b2d"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{"serviceRequest/authoredOn": "2019-06-04", "serviceRequest/identifier_value": "1", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "0", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "0QH6R84NZEVZ6FD87G94UDQ1NT1HWK", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-04", "serviceRequest/identifier_value": "2", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "2", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "DA8DV5VNC520V4AW0DD4PY0TVFJLXG", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-12", "serviceRequest/identifier_value": "3", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "0", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Ngarenairobi RC Health centre", "serviceRequest/requester_reference": "2", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "9PAXV9DHENCMAL0MD9WCLGF6DUALRZ", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-14", "serviceRequest/identifier_value": "4", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "2", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Magugu Health Centre", "serviceRequest/requester_reference": "6", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "RV98DJ47NE093WQZYUNYR5MKD8RAL2", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-20", "serviceRequest/identifier_value": "5", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Dareda Hospital", "serviceRequest/requester_reference": "3", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "YYJM3EF040EDD1Z3Q4DE3RPEXCM9G9", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-27", "serviceRequest/identifier_value": "6", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "5K8JQWC7DM4X2RJM3XYQWRU9EJ8V8P", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-06-27", "serviceRequest/identifier_value": "7", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "NURM0TRUZV8MC8WFQZPUDUC7PLR2ER", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "8", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Abortion", "serviceRequest/requester_display": "Bashnet Hospital", "serviceRequest/requester_reference": "4", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "8H5N62X4V95G8Q2THPD6X08Y745MAY", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "9", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "HYTGMF1Z301UJZE3J3ULPN4PYG5VLL", "serviceRequest/subject_type": "Patient"}
{"serviceRequest/authoredOn": "2019-07-02", "serviceRequest/identifier_value": "10", "serviceRequest/intent": "plan", "serviceRequest/performer_display": "null", "serviceRequest/performer_reference": "1", "serviceRequest/performer_type": "Organization", "serviceRequest/reasonCode_text": "Other", "serviceRequest/requester_display": "Galapo Health centre", "serviceRequest/requester_reference": "7", "serviceRequest/requester_type": "Organization", "serviceRequest/resourceType": "ServiceRequest", "serviceRequest/status": "unknown", "serviceRequest/subject_reference": "P27Q2AK2U1FDC1455WWHC32PLY763V", "serviceRequest/subject_type": "Patient"}
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ class EncodingFormat:
DICOM = "image/dicom"
JSON = "application/json"
JSON_LINES = "application/jsonlines"
FHIR = "application/fhir+json"
MP3 = "audio/mpeg"
MP4 = "video/mp4"
PARQUET = "application/x-parquet"
Expand Down
15 changes: 15 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/optional.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,5 +111,20 @@ def tifffile(cls) -> types.ModuleType:
"""Cached tifffle module."""
return _try_import("tifffile", package_name="Tifffile")

@cached_class_property
def jsonpath_rw(cls) -> types.ModuleType:
    """Cached accessor for the optional `jsonpath_rw` module.

    Returns:
        Whatever `_try_import` returns for the module name "jsonpath_rw";
        "jsonpath-rw" is forwarded as the package name.
    """
    imported_module = _try_import("jsonpath_rw", package_name="jsonpath-rw")
    return imported_module

@cached_class_property
def jmespath(cls) -> types.ModuleType:
    """Cached accessor for the optional `jmespath` module.

    Returns:
        Whatever `_try_import` returns for the module name "jmespath";
        "jmespath" is also forwarded as the package name.
    """
    imported_module = _try_import("jmespath", package_name="jmespath")
    return imported_module

@cached_class_property
def fhir_resources(cls) -> types.ModuleType:
    """Cached accessor for the optional `fhir.resources` module.

    Returns:
        Whatever `_try_import` returns for the dotted module name
        "fhir.resources"; the same string is forwarded as the package name.
    """
    imported_module = _try_import("fhir.resources", package_name="fhir.resources")
    return imported_module


deps = OptionalDependencies
Loading
Loading