paralleldrive · janhesters · Mar 23, 2026 · Mar 22, 2026
diff --git a/README.md b/README.md
@@ -166,6 +166,10 @@ In GitHub Actions, use `--save-responses` and upload the `ai-evals/` directory a
 
 The `if: always()` ensures responses are uploaded even when assertions fail, so you can inspect exactly what the agent produced.
 
+#### Partial results on timeout
+
+If some runs complete before another run times out, the completed runs' responses are still written to the responses file. The timed-out run's partial agent output is captured as well, followed by a `[RITEWAY TIMEOUT]` marker that states how long the agent ran before it was terminated and marks the point where its output was cut off. This lets you see why a run took too long and, potentially, optimize the prompt so it runs faster.
+
 ### Custom agent configuration
 
 `riteway ai init` writes all built-in agent configs to `riteway.agent-config.json` in your project root, so you can add custom agents or tweak existing flags:

diff --git a/source/ai-command.js b/source/ai-command.js
@@ -137,9 +137,10 @@ export const runAICommand = async ({ filePath, runs, threshold, timeout, agent,
     });
   }
 
+  const testFilename = basename(filePath);
+
   try {
     const fullPath = validateFilePath(filePath, cwd);
-    const testFilename = basename(filePath);
     const agentConfig = await resolveAgentConfig({ agent, agentConfigPath, cwd });
 
     const agentLabel = agentConfigPath
@@ -208,6 +209,22 @@ export const runAICommand = async ({ filePath, runs, threshold, timeout, agent,
     console.log('Test suite passed!');
     return outputPath;
   } catch (error) {
+    // If the error carries partial results (e.g. some runs completed before a timeout),
+    // write them to disk before re-throwing so CI artifacts capture what we have.
+    const partialResults = error.cause?.partialResults;
+    if (partialResults) {
+      try {
+        const outputPath = await recordTestOutput({
+          results: partialResults,
+          testFilename,
+          saveResponses
+        });
+        console.log(`\nPartial results recorded: ${outputPath}`);
+      } catch {
+        // Best-effort — don't mask the original error
+      }
+    }
+
     // error-causes wraps structured errors as { cause: { name, code, ... } }.
     // Presence of cause.name is the stable public contract of the library — only
     // changes if error-causes itself changes its API. Re-throw to avoid double-wrapping.

diff --git a/source/ai-command.test.js b/source/ai-command.test.js
@@ -616,6 +616,64 @@ describe('runAICommand() orchestration', () => {
     });
   });
 
+  test('writes partial results when error carries partialResults', async () => {
+    const partialResults = {
+      passed: false,
+      assertions: [{ requirement: 'Given a test, should pass', passed: true, passCount: 1, totalRuns: 1 }],
+      responses: ['Partial response from run 1']
+    };
+    const errorWithPartials = new Error('outer');
+    errorWithPartials.cause = {
+      name: 'TimeoutError',
+      code: 'AGENT_TIMEOUT',
+      message: 'Agent timed out after 300000ms',
+      partialResults
+    };
+    vi.mocked(runAITests).mockRejectedValue(errorWithPartials);
+    vi.mocked(recordTestOutput).mockClear();
+    vi.mocked(recordTestOutput).mockResolvedValue('/ai-evals/partial.tap.md');
+    const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+    onTestFinished(() => consoleSpy.mockRestore());
+
+    await Try(runAICommand, { ...args, saveResponses: true });
+
+    assert({
+      given: 'error with partialResults from a timeout',
+      should: 'call recordTestOutput with the partial results',
+      actual: vi.mocked(recordTestOutput).mock.lastCall?.[0].results,
+      expected: partialResults
+    });
+
+    assert({
+      given: 'error with partialResults and saveResponses: true',
+      should: 'forward saveResponses to recordTestOutput',
+      actual: vi.mocked(recordTestOutput).mock.lastCall?.[0].saveResponses,
+      expected: true
+    });
+  });
+
+  test('does not write partial results when error has no partialResults', async () => {
+    const plainError = new Error('outer');
+    plainError.cause = {
+      name: 'TimeoutError',
+      code: 'AGENT_TIMEOUT',
+      message: 'Agent timed out'
+    };
+    vi.mocked(runAITests).mockRejectedValue(plainError);
+    vi.mocked(recordTestOutput).mockClear();
+    const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+    onTestFinished(() => consoleSpy.mockRestore());
+
+    await Try(runAICommand, args);
+
+    assert({
+      given: 'error without partialResults',
+      should: 'not call recordTestOutput',
+      actual: vi.mocked(recordTestOutput).mock.calls.length,
+      expected: 0
+    });
+  });
+
   test('wraps unexpected errors in AITestError', async () => {
     vi.mocked(runAITests).mockRejectedValue(new Error('connection refused'));
     const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});

diff --git a/source/ai-runner.js b/source/ai-runner.js
@@ -1,4 +1,5 @@
 import { readFile } from 'fs/promises';
+import { createError } from 'error-causes';
 import { executeAgent } from './execute-agent.js';
 import { extractTests, buildResultPrompt, buildJudgePrompt } from './test-extractor.js';
 import { limitConcurrency } from './limit-concurrency.js';
@@ -120,18 +121,42 @@ const executeRuns = ({
   agentConfig,
   timeout
 }) => {
-  const runTasks = Array.from({ length: runs }, (_, runIndex) => async () =>
-    executeSingleRun({
+  const completedRuns = [];
+  const inFlightPromises = [];
+  let lastTimedOutStdout;
+
+  const runTasks = Array.from({ length: runs }, (_, runIndex) => async () => {
+    const runPromise = executeSingleRun({
       runIndex,
       extracted,
       resultPrompt,
       runs,
       agentConfig,
       timeout
-    })
-  );
+    });
+    inFlightPromises.push(runPromise);
+
+    try {
+      const result = await runPromise;
+      completedRuns.push(result);
+      return result;
+    } catch (error) {
+      const partialStdout = error.cause?.partialStdout;
+      if (partialStdout !== undefined) {
+        lastTimedOutStdout = partialStdout;
+      }
+      throw error;
+    }
+  });
 
-  return limitConcurrency(runTasks, concurrency);
+  return {
+    promise: limitConcurrency(runTasks, concurrency),
+    // Wait for all in-flight runs to settle before reading completedRuns,
+    // since Promise.all rejects immediately and other runs may still be running.
+    waitForSettled: () => Promise.allSettled(inFlightPromises),
+    completedRuns,
+    getTimedOutPartialStdout: () => lastTimedOutStdout
+  };
 };
 
 const aggregateResults = ({ assertions, allRunResults, threshold, runs }) => {
@@ -180,7 +205,7 @@ export const runAITests = async ({
 
   const { resultPrompt, assertions } = extracted;
 
-  const allRunResults = await executeRuns({
+  const { promise: runsPromise, waitForSettled, completedRuns, getTimedOutPartialStdout } = executeRuns({
     extracted,
     resultPrompt,
     runs,
@@ -189,8 +214,45 @@ export const runAITests = async ({
     timeout
   });
 
-  const aggregated = aggregateResults({ assertions, allRunResults, threshold, runs });
-  const responses = allRunResults.map(({ response }) => response);
+  try {
+    const allRunResults = await runsPromise;
+    const aggregated = aggregateResults({ assertions, allRunResults, threshold, runs });
+    const responses = allRunResults.map(({ response }) => response);
+    return { ...aggregated, responses };
+  } catch (error) {
+    await waitForSettled();
+    const lastTimedOutStdout = getTimedOutPartialStdout();
+    const hasPartialData = completedRuns.length > 0 || lastTimedOutStdout !== undefined;
+
+    if (hasPartialData) {
+      const responses = completedRuns.map(({ response }) => response);
+
+      if (lastTimedOutStdout !== undefined) {
+        const timeoutMs = error.cause?.timeout ?? timeout;
+        responses.push(
+          lastTimedOutStdout +
+          `\n\n---\n[RITEWAY TIMEOUT] Agent was terminated after ${timeoutMs}ms. Output above is partial.\n---\n`
+        );
+      }
+
+      const aggregated = completedRuns.length > 0
+        ? aggregateResults({
+          assertions,
+          allRunResults: completedRuns,
+          threshold,
+          runs: completedRuns.length
+        })
+        : { passed: false, assertions: [] };
+
+      throw createError({
+        name: error.cause?.name ?? error.name,
+        code: error.cause?.code ?? error.code,
+        message: error.cause?.message ?? error.message,
+        partialResults: { ...aggregated, responses },
+        cause: error
+      });
+    }
 
-  return { ...aggregated, responses };
+    throw error;
+  }
 };
diff --git a/source/ai-runner.test.js b/source/ai-runner.test.js
@@ -307,6 +307,152 @@ describe('runAITests()', () => {
     }
   });
 
+  test('attaches partialResults to error when a run times out after others complete', async () => {
+    const testDir = join(tmpdir(), 'riteway-test-' + createSlug());
+
+    try {
+      mkdirSync(testDir, { recursive: true });
+      writeFileSync(join(testDir, 'prompt.mdc'), 'Test prompt context');
+      const testFile = join(testDir, 'test.sudo');
+      writeFileSync(testFile, '- Given a test, should pass');
+
+      const extractedTests = [{ id: 1, requirement: 'Given a test, should pass' }];
+      const counterFile = join(testDir, 'call-count.txt');
+      writeFileSync(counterFile, '0');
+
+      // Mock agent: the first result call prints a complete response; the second
+      // writes partial output and then hangs. Extraction and judge calls respond immediately.
+      const extractionResult = {
+        userPrompt: 'What is 2+2?',
+        importPaths: ['prompt.mdc'],
+        assertions: extractedTests
+      };
+      const tapYAML = '---\npassed: true\nactual: "ok"\nexpected: "ok"\nscore: 85\n---';
+
+      const mockScript = `
+        const fs = require('fs');
+        const prompt = process.argv[process.argv.length - 1];
+        if (prompt.includes('<test-file-contents>')) {
+          console.log(JSON.stringify(${JSON.stringify(extractionResult)}));
+        } else if (prompt.includes('ACTUAL RESULT TO EVALUATE')) {
+          console.log(${JSON.stringify(tapYAML)});
+        } else if (prompt.includes('CONTEXT (Prompt Under Test)')) {
+          const count = parseInt(fs.readFileSync(${JSON.stringify(counterFile)}, 'utf-8'));
+          fs.writeFileSync(${JSON.stringify(counterFile)}, String(count + 1));
+          if (count === 0) {
+            console.log('First run response');
+          } else {
+            process.stdout.write('Partial output from run 2 before timeout');
+            setTimeout(() => {}, 60000);
+          }
+        }
+      `;
+
+      const error = await Try(runAITests, {
+        filePath: testFile,
+        runs: 2,
+        threshold: 50,
+        concurrency: 1,
+        timeout: 2000,
+        projectRoot: testDir,
+        agentConfig: {
+          command: 'node',
+          args: ['-e', mockScript]
+        }
+      });
+
+      assert({
+        given: 'run 1 completes but run 2 times out',
+        should: 'throw an error with partialResults attached',
+        actual: error?.cause?.partialResults !== undefined,
+        expected: true
+      });
+
+      const responses = error?.cause?.partialResults?.responses;
+
+      assert({
+        given: 'run 1 completed successfully',
+        should: 'include the completed response as first entry',
+        actual: responses?.[0],
+        expected: 'First run response\n'
+      });
+
+      assert({
+        given: 'run 2 timed out with partial output',
+        should: 'include a second response with timeout marker',
+        actual: responses?.[1]?.includes('[RITEWAY TIMEOUT]'),
+        expected: true
+      });
+
+      assert({
+        given: 'partial results from 1 completed run',
+        should: 'include aggregated assertions',
+        actual: error?.cause?.partialResults?.assertions?.length,
+        expected: 1
+      });
+    } finally {
+      rmSync(testDir, { recursive: true, force: true });
+    }
+  });
+
+  test('includes partial stdout and timeout marker when all runs time out', async () => {
+    const testDir = join(tmpdir(), 'riteway-test-' + createSlug());
+
+    try {
+      mkdirSync(testDir, { recursive: true });
+      writeFileSync(join(testDir, 'prompt.mdc'), 'Test prompt context');
+      const testFile = join(testDir, 'test.sudo');
+      writeFileSync(testFile, '- Given a test, should pass');
+
+      const extractedTests = [{ id: 1, requirement: 'Given a test, should pass' }];
+      const extractionResult = {
+        userPrompt: 'What is 2+2?',
+        importPaths: ['prompt.mdc'],
+        assertions: extractedTests
+      };
+
+      // Mock agent that writes partial output then hangs
+      const mockScript = `
+        const prompt = process.argv[process.argv.length - 1];
+        if (prompt.includes('<test-file-contents>')) {
+          console.log(JSON.stringify(${JSON.stringify(extractionResult)}));
+        } else if (prompt.includes('CONTEXT (Prompt Under Test)')) {
+          process.stdout.write('Partial agent thoughts before timeout');
+          setTimeout(() => {}, 60000);
+        }
+      `;
+
+      const error = await Try(runAITests, {
+        filePath: testFile,
+        runs: 1,
+        threshold: 50,
+        concurrency: 1,
+        timeout: 2000,
+        projectRoot: testDir,
+        agentConfig: {
+          command: 'node',
+          args: ['-e', mockScript]
+        }
+      });
+
+      assert({
+        given: 'all runs time out but produce partial output',
+        should: 'include partialResults with the partial stdout',
+        actual: error?.cause?.partialResults?.responses?.[0]?.includes('Partial agent thoughts before timeout'),
+        expected: true
+      });
+
+      assert({
+        given: 'timed out run',
+        should: 'include timeout marker in the response',
+        actual: error?.cause?.partialResults?.responses?.[0]?.includes('[RITEWAY TIMEOUT]'),
+        expected: true
+      });
+    } finally {
+      rmSync(testDir, { recursive: true, force: true });
+    }
+  });
+
   test('throws when test file does not exist', async () => {
     const error = await Try(runAITests, {
       filePath: '/nonexistent/path/to/test.sudo',