Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions nemo_automodel/components/distributed/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,14 @@ def __exit__(self, exc_type, exc_val, exc_tb):
if not success:
logger.warning("Barrier timed out during exit, continuing anyway")
if exc_type is not None:
# TODO: propagate failure to the entire job
quit(1)
# Log the exception and make the error visible
logger.error(
"Exception inside FirstRankPerNode: %s: %s",
exc_type.__name__,
exc_val,
exc_info=(exc_type, exc_val, exc_tb),
)
raise SystemExit(1) from exc_val
finally:
if self._created_pg:
dist.destroy_process_group()
Expand Down
Loading