Skip to content

Commit 93c7348

Browse files
authored
Fix task retry logic to respect retries for all exit codes (#58384)
1 parent 47f5e22 commit 93c7348

2 files changed

Lines changed: 24 additions & 7 deletions

File tree

task-sdk/src/airflow/sdk/execution_time/supervisor.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1171,10 +1171,12 @@ def final_state(self):
11711171
if self._exit_code != 0 and self._terminal_state == SERVER_TERMINATED:
11721172
return SERVER_TERMINATED
11731173

1174-
# Any negative exit code indicates a signal kill
1175-
# We consider all signal kills as potentially retryable
1176-
# since they're often transient issues that could succeed on retry
1177-
if self._exit_code < 0 and self._should_retry:
1174+
# Any non zero exit code indicates a failure
1175+
# If retries are configured, mark as UP_FOR_RETRY
1176+
# Negative exit codes indicate signal kills (often transient)
1177+
# Positive exit codes can also be transient failures like network issues in a task communicating to
1178+
# external services
1179+
if self._exit_code != 0 and self._should_retry:
11781180
return TaskInstanceState.UP_FOR_RETRY
11791181

11801182
return TaskInstanceState.FAILED

task-sdk/tests/task_sdk/execution_time/test_supervisor.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2695,7 +2695,7 @@ def mock_upload_to_remote(process_log, ti):
26952695

26962696

26972697
class TestSignalRetryLogic:
2698-
"""Test signal based retry logic in ActivitySubprocess."""
2698+
"""Test retry logic for exit codes (signals and non-signal failures) in ActivitySubprocess."""
26992699

27002700
@pytest.mark.parametrize(
27012701
"signal",
@@ -2748,8 +2748,8 @@ def test_signals_without_retry_always_fail(self, mocker, signal):
27482748
result = mock_watched_subprocess.final_state
27492749
assert result == TaskInstanceState.FAILED
27502750

2751-
def test_non_signal_exit_code_goes_to_failed(self, mocker):
2752-
"""Test that non signal exit codes go to failed regardless of task retries."""
2751+
def test_non_signal_exit_code_with_retry_goes_to_up_for_retry(self, mocker):
2752+
"""Test that non-signal exit codes with retries enabled go to UP_FOR_RETRY."""
27532753
mock_watched_subprocess = ActivitySubprocess(
27542754
process_log=mocker.MagicMock(),
27552755
id=TI_ID,
@@ -2761,6 +2761,21 @@ def test_non_signal_exit_code_goes_to_failed(self, mocker):
27612761
mock_watched_subprocess._exit_code = 1
27622762
mock_watched_subprocess._should_retry = True
27632763

2764+
assert mock_watched_subprocess.final_state == TaskInstanceState.UP_FOR_RETRY
2765+
2766+
def test_non_signal_exit_code_without_retry_goes_to_failed(self, mocker):
2767+
"""Test that non-signal exit codes without retries enabled go to FAILED."""
2768+
mock_watched_subprocess = ActivitySubprocess(
2769+
process_log=mocker.MagicMock(),
2770+
id=TI_ID,
2771+
pid=12345,
2772+
stdin=mocker.Mock(),
2773+
process=mocker.Mock(),
2774+
client=mocker.Mock(),
2775+
)
2776+
mock_watched_subprocess._exit_code = 1
2777+
mock_watched_subprocess._should_retry = False
2778+
27642779
assert mock_watched_subprocess.final_state == TaskInstanceState.FAILED
27652780

27662781

0 commit comments

Comments
 (0)