Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
fc9fe08
fix: update tests and docs for mlpstorage_py rename and uv workflow
russfellows Apr 9, 2026
1de3a84
fix: resolve all 129 unit test failures
russfellows Apr 9, 2026
0966b3d
feat: universal --file/--object flags and fix progress spinner
russfellows Apr 9, 2026
ffac5a2
refactor: consolidate object-store tests, remove hardcoded runtime pa…
russfellows Apr 9, 2026
c806d8e
fix: switch to russfellows dlio-benchmark fork; consolidate object-st…
russfellows Apr 10, 2026
aa8de4b
fix: switch dlio-benchmark ref from deleted dev branch to main
russfellows Apr 10, 2026
217ac6e
chore: update uv.lock to dlio_benchmark f58903c (PRs #9 and #10)
russfellows Apr 10, 2026
a66cda8
bug-fixes and perf enhancements for object storage, checkpointing, an…
russfellows Apr 27, 2026
9ecf1a4
fix: correct mlpstorage → mlpstorage_py references in upstream test f…
russfellows Apr 27, 2026
64165f7
Merge pull request #28 from russfellows/branch-3-0-1/bug-fixes-perf-e…
russfellows Apr 27, 2026
1210554
perf: Flux NP×RT scaling study, s3dlio-gen datagen, DLRM test results
russfellows May 12, 2026
3d19349
feat: UNet3D B200 sweep scripts, DLRM config fixes, DataLoader archit…
russfellows May 12, 2026
e184179
chore: update uv.lock for s3dlio 0.9.100 + add retinanet test scripts
russfellows May 13, 2026
6d0e761
docs: add RetinaNet NP scaling results (TorchIterableDatasetSimple, s…
russfellows May 13, 2026
b1dc6e0
chore: bump version to 3.0.2
russfellows May 13, 2026
7891ce2
chore: pin dlio-benchmark 3.0.2 from GitHub, s3dlio 0.9.100 from PyPI
russfellows May 13, 2026
2d4029c
chore: clean up tests/object-store — remove superseded scripts, archi…
russfellows May 13, 2026
ecb89ac
chore: reorganize tests/object-store — remove stale/nonstandard scrip…
russfellows May 13, 2026
14b513c
docs: rewrite tests/object-store/README.md for current structure
russfellows May 13, 2026
08eb039
docs: add Recommended Hardware section to tests/object-store/README.md
russfellows May 13, 2026
82c3517
chore: remove stale Apr-25 result docs; link to current docs/ results
russfellows May 13, 2026
022820b
cli_parser: guard --file/--object consolidation for non-benchmark sub…
idevasena May 12, 2026
03765a2
Remove unwanted file
idevasena May 12, 2026
7e4245b
Fix #363: pass results_dir to collect_cluster_info
idevasena May 9, 2026
2431011
Fix #365, #372: metadata override propagation, test suite fixes, env …
russfellows May 13, 2026
4534ae4
Merge pull request #29 from russfellows/branch-3-0-2/bug-fixes-perf-e…
russfellows May 13, 2026
fa55107
chore: merge upstream main (39e657d) — our code supersedes upstream c…
russfellows May 13, 2026
3a5195e
fix: exclude test_dlio_storage.py from pytest collection (StorageType…
russfellows May 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,23 @@ env-fast

# TLS certificates — local only, never commit (paths to certs are in .env)
.certs/

# Benchmark simulation output files
sim_*.tsv
sim_*.tsv.zst

# Sweep run logs and results (local benchmark output)
sweep_logs/
sweep_flux_master.log
results/

# Test scripts and helpers not part of the benchmark suite
test_s3dlio_gen_direct.py

# Hydra runtime output (created in cwd when running workloads with hydra config)
hydra_log/

# Timestamped sweep run logs written to repo root by sweep_*.sh scripts
sweep_unet3d_*.log
sweep_dlrm_*.log
sweep_flux_*.log
8 changes: 4 additions & 4 deletions configs/dlio/workload/dlrm_b200.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ dataset:
data_folder: data/dlrm/
format: parquet
num_files_train: 1024 # Number of training files to generate
num_samples_per_file: 4718592 # Samples per parquet file
num_samples_per_file: 1536000 # 250 RGs × 6144 → ~3.1 MiB footer (under s3-ultra 4 MiB limit)
record_length_bytes: 761
compression: none # Options: snappy, gzip, lz4, zstd, none

Expand Down Expand Up @@ -627,12 +627,12 @@ dataset:
reader:
data_loader: pytorch
batch_size: 12288
prefetch_size: 2 # Increase from default 2 for better I/O overlap
read_threads: 4 # Increase parallelism
prefetch_size: 0
read_threads: 0 # single-process, no IPC overhead; ThreadPoolExecutor handles I/O
file_shuffle: seed

train:
epochs: 1
epochs: 2
computation_time: 0.000375

metric:
Expand Down
5 changes: 3 additions & 2 deletions configs/dlio/workload/dlrm_datagen.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ dataset:
data_folder: data/dlrm/
format: parquet
num_files_train: 1024 # Number of training files to generate
num_samples_per_file: 4718592 # Samples per parquet file
num_samples_per_file: 1536000 # Samples per parquet file (250 RGs × 6144 → ~3.1 MiB footer, under s3-ultra 4 MiB limit)
record_length_bytes: 761
compression: none # Options: snappy, gzip, lz4, zstd, none

# Parquet-specific configuration
parquet:
row_group_size: 8192
use_s3dlio_gen: true
row_group_size: 6144 # Match batch_size for optimal caching
read_mode: row_group

columns:
Expand Down
2 changes: 2 additions & 0 deletions configs/dlio/workload/flux_datagen.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ dataset:
record_length: 2164832

parquet:
use_s3dlio_gen: true
row_group_size: 48
# Parquet-specific field specifications
columns:
- name: t5_encodings
Expand Down
40 changes: 40 additions & 0 deletions configs/dlio/workload/unet3d_b200.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# DLIO workload definition for UNet3D with B200-scaled compute time
# (configs/dlio/workload/unet3d_b200.yaml).
# Only the training phase is enabled: data generation and checkpointing are
# switched off in the `workflow` section below.
model:
  name: unet3d
  type: cnn
  model_size: 499153191

framework: pytorch

# Phase switches: this workload trains against an already-generated dataset.
workflow:
  generate_data: False
  train: True
  checkpoint: False

dataset:
  data_folder: data/unet3d/
  format: npz
  num_files_train: 7200  # ~984 GiB: 7200 × ~140 MiB avg file size
  num_samples_per_file: 1
  # 146600628 bytes ≈ 139.8 MiB mean record size, matching the ~140 MiB
  # average file size quoted above (one sample per file).
  record_length_bytes: 146600628
  record_length_bytes_stdev: 68341808
  record_length_bytes_resize: 2097152

reader:
  data_loader: pytorch
  batch_size: 7
  read_threads: 4
  file_shuffle: seed
  sample_shuffle: seed

train:
  epochs: 5
  # B200 computation_time = H100 (0.323 s) ÷ 2 (B200 is ~2× faster than H100)
  computation_time: 0.162

# NOTE(review): workflow.checkpoint is False above, so these settings are
# presumably inactive for this workload — confirm before relying on them.
checkpoint:
  checkpoint_folder: checkpoints/unet3d
  checkpoint_after_epoch: 5
  epochs_between_checkpoints: 2

# NOTE(review): `au` is presumably the accelerator-utilization threshold
# (0.90 = 90%) — confirm against the benchmark's metric definition.
metric:
  au: 0.90
Loading
Loading