Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,10 @@ In GitHub Actions, use `--save-responses` and upload the `ai-evals/` directory a

The `if: always()` ensures responses are uploaded even when assertions fail, so you can inspect exactly what the agent produced.

#### Partial results on timeout

If some runs complete before another run times out, the completed runs' responses are still written to the responses file. The timed-out run's partial agent output is also captured, followed by a `[RITEWAY TIMEOUT]` marker that states how long the agent ran before it was terminated and marks the point where its output was cut off. This lets you debug why a run took too long and, potentially, optimize the prompt so it runs faster.

### Custom agent configuration

`riteway ai init` writes all built-in agent configs to `riteway.agent-config.json` in your project root, so you can add custom agents or tweak existing flags:
Expand Down
19 changes: 18 additions & 1 deletion source/ai-command.js
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,10 @@ export const runAICommand = async ({ filePath, runs, threshold, timeout, agent,
});
}

const testFilename = basename(filePath);

try {
const fullPath = validateFilePath(filePath, cwd);
const testFilename = basename(filePath);
const agentConfig = await resolveAgentConfig({ agent, agentConfigPath, cwd });

const agentLabel = agentConfigPath
Expand Down Expand Up @@ -208,6 +209,22 @@ export const runAICommand = async ({ filePath, runs, threshold, timeout, agent,
console.log('Test suite passed!');
return outputPath;
} catch (error) {
// If the error carries partial results (e.g. some runs completed before a timeout),
// write them to disk before re-throwing so CI artifacts capture what we have.
const partialResults = error.cause?.partialResults;
if (partialResults) {
try {
const outputPath = await recordTestOutput({
results: partialResults,
testFilename,
saveResponses
});
console.log(`\nPartial results recorded: ${outputPath}`);
} catch {
// Best-effort — don't mask the original error
}
}

// error-causes wraps structured errors as { cause: { name, code, ... } }.
// Presence of cause.name is the stable public contract of the library — only
// changes if error-causes itself changes its API. Re-throw to avoid double-wrapping.
Expand Down
58 changes: 58 additions & 0 deletions source/ai-command.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,64 @@ describe('runAICommand() orchestration', () => {
});
});

test('writes partial results when error carries partialResults', async () => {
const partialResults = {
passed: false,
assertions: [{ requirement: 'Given a test, should pass', passed: true, passCount: 1, totalRuns: 1 }],
responses: ['Partial response from run 1']
};
const errorWithPartials = new Error('outer');
errorWithPartials.cause = {
name: 'TimeoutError',
code: 'AGENT_TIMEOUT',
message: 'Agent timed out after 300000ms',
partialResults
};
vi.mocked(runAITests).mockRejectedValue(errorWithPartials);
vi.mocked(recordTestOutput).mockClear();
vi.mocked(recordTestOutput).mockResolvedValue('/ai-evals/partial.tap.md');
const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
onTestFinished(() => consoleSpy.mockRestore());

await Try(runAICommand, { ...args, saveResponses: true });

assert({
given: 'error with partialResults from a timeout',
should: 'call recordTestOutput with the partial results',
actual: vi.mocked(recordTestOutput).mock.lastCall?.[0].results,
expected: partialResults
});

assert({
given: 'error with partialResults and saveResponses: true',
should: 'forward saveResponses to recordTestOutput',
actual: vi.mocked(recordTestOutput).mock.lastCall?.[0].saveResponses,
expected: true
});
});

test('does not write partial results when error has no partialResults', async () => {
const plainError = new Error('outer');
plainError.cause = {
name: 'TimeoutError',
code: 'AGENT_TIMEOUT',
message: 'Agent timed out'
};
vi.mocked(runAITests).mockRejectedValue(plainError);
vi.mocked(recordTestOutput).mockClear();
const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
onTestFinished(() => consoleSpy.mockRestore());

await Try(runAICommand, args);

assert({
given: 'error without partialResults',
should: 'not call recordTestOutput',
actual: vi.mocked(recordTestOutput).mock.calls.length,
expected: 0
});
});

test('wraps unexpected errors in AITestError', async () => {
vi.mocked(runAITests).mockRejectedValue(new Error('connection refused'));
const consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
Expand Down
80 changes: 71 additions & 9 deletions source/ai-runner.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { readFile } from 'fs/promises';
import { createError } from 'error-causes';
import { executeAgent } from './execute-agent.js';
import { extractTests, buildResultPrompt, buildJudgePrompt } from './test-extractor.js';
import { limitConcurrency } from './limit-concurrency.js';
Expand Down Expand Up @@ -120,18 +121,42 @@ const executeRuns = ({
agentConfig,
timeout
}) => {
const runTasks = Array.from({ length: runs }, (_, runIndex) => async () =>
executeSingleRun({
const completedRuns = [];
const inFlightPromises = [];
let lastTimedOutStdout;

const runTasks = Array.from({ length: runs }, (_, runIndex) => async () => {
const runPromise = executeSingleRun({
runIndex,
extracted,
resultPrompt,
runs,
agentConfig,
timeout
})
);
});
inFlightPromises.push(runPromise);

try {
const result = await runPromise;
completedRuns.push(result);
return result;
} catch (error) {
const partialStdout = error.cause?.partialStdout;
if (partialStdout !== undefined) {
lastTimedOutStdout = partialStdout;
}
throw error;
}
});

return limitConcurrency(runTasks, concurrency);
return {
promise: limitConcurrency(runTasks, concurrency),
// Wait for all in-flight runs to settle before reading completedRuns,
// since Promise.all rejects immediately and other runs may still be running.
waitForSettled: () => Promise.allSettled(inFlightPromises),
completedRuns,
getTimedOutPartialStdout: () => lastTimedOutStdout
};
};

const aggregateResults = ({ assertions, allRunResults, threshold, runs }) => {
Expand Down Expand Up @@ -180,7 +205,7 @@ export const runAITests = async ({

const { resultPrompt, assertions } = extracted;

const allRunResults = await executeRuns({
const { promise: runsPromise, waitForSettled, completedRuns, getTimedOutPartialStdout } = executeRuns({
extracted,
resultPrompt,
runs,
Expand All @@ -189,8 +214,45 @@ export const runAITests = async ({
timeout
});

const aggregated = aggregateResults({ assertions, allRunResults, threshold, runs });
const responses = allRunResults.map(({ response }) => response);
try {
const allRunResults = await runsPromise;
const aggregated = aggregateResults({ assertions, allRunResults, threshold, runs });
const responses = allRunResults.map(({ response }) => response);
return { ...aggregated, responses };
} catch (error) {
await waitForSettled();
const lastTimedOutStdout = getTimedOutPartialStdout();
const hasPartialData = completedRuns.length > 0 || lastTimedOutStdout !== undefined;

if (hasPartialData) {
const responses = completedRuns.map(({ response }) => response);

if (lastTimedOutStdout !== undefined) {
const timeoutMs = error.cause?.timeout ?? timeout;
responses.push(
lastTimedOutStdout +
`\n\n---\n[RITEWAY TIMEOUT] Agent was terminated after ${timeoutMs}ms. Output above is partial.\n---\n`
);
}

const aggregated = completedRuns.length > 0
? aggregateResults({
assertions,
allRunResults: completedRuns,
threshold,
runs: completedRuns.length
})
: { passed: false, assertions: [] };

throw createError({
name: error.cause?.name ?? error.name,
code: error.cause?.code ?? error.code,
message: error.cause?.message ?? error.message,
partialResults: { ...aggregated, responses },
cause: error
});
}

return { ...aggregated, responses };
throw error;
}
};
146 changes: 146 additions & 0 deletions source/ai-runner.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,152 @@ describe('runAITests()', () => {
}
});

test('attaches partialResults to error when a run times out after others complete', async () => {
const testDir = join(tmpdir(), 'riteway-test-' + createSlug());

try {
mkdirSync(testDir, { recursive: true });
writeFileSync(join(testDir, 'prompt.mdc'), 'Test prompt context');
const testFile = join(testDir, 'test.sudo');
writeFileSync(testFile, '- Given a test, should pass');

const extractedTests = [{ id: 1, requirement: 'Given a test, should pass' }];
const counterFile = join(testDir, 'call-count.txt');
writeFileSync(counterFile, '0');

// Mock agent that succeeds on first result call, hangs on second.
// Extraction and judge calls respond immediately.
const extractionResult = {
userPrompt: 'What is 2+2?',
importPaths: ['prompt.mdc'],
assertions: extractedTests
};
const tapYAML = '---\npassed: true\nactual: "ok"\nexpected: "ok"\nscore: 85\n---';

const mockScript = `
const fs = require('fs');
const prompt = process.argv[process.argv.length - 1];
if (prompt.includes('<test-file-contents>')) {
console.log(JSON.stringify(${JSON.stringify(extractionResult)}));
} else if (prompt.includes('ACTUAL RESULT TO EVALUATE')) {
console.log(${JSON.stringify(tapYAML)});
} else if (prompt.includes('CONTEXT (Prompt Under Test)')) {
const count = parseInt(fs.readFileSync(${JSON.stringify(counterFile)}, 'utf-8'));
fs.writeFileSync(${JSON.stringify(counterFile)}, String(count + 1));
if (count === 0) {
console.log('First run response');
} else {
process.stdout.write('Partial output from run 2 before timeout');
setTimeout(() => {}, 60000);
}
}
`;

const error = await Try(runAITests, {
filePath: testFile,
runs: 2,
threshold: 50,
concurrency: 1,
timeout: 2000,
projectRoot: testDir,
agentConfig: {
command: 'node',
args: ['-e', mockScript]
}
});

assert({
given: 'run 1 completes but run 2 times out',
should: 'throw an error with partialResults attached',
actual: error?.cause?.partialResults !== undefined,
expected: true
});

const responses = error?.cause?.partialResults?.responses;

assert({
given: 'run 1 completed successfully',
should: 'include the completed response as first entry',
actual: responses?.[0],
expected: 'First run response\n'
});

assert({
given: 'run 2 timed out with partial output',
should: 'include a second response with timeout marker',
actual: responses?.[1]?.includes('[RITEWAY TIMEOUT]'),
expected: true
});

assert({
given: 'partial results from 1 completed run',
should: 'include aggregated assertions',
actual: error?.cause?.partialResults?.assertions?.length,
expected: 1
});
} finally {
rmSync(testDir, { recursive: true, force: true });
}
});

test('includes partial stdout and timeout marker when all runs time out', async () => {
const testDir = join(tmpdir(), 'riteway-test-' + createSlug());

try {
mkdirSync(testDir, { recursive: true });
writeFileSync(join(testDir, 'prompt.mdc'), 'Test prompt context');
const testFile = join(testDir, 'test.sudo');
writeFileSync(testFile, '- Given a test, should pass');

const extractedTests = [{ id: 1, requirement: 'Given a test, should pass' }];
const extractionResult = {
userPrompt: 'What is 2+2?',
importPaths: ['prompt.mdc'],
assertions: extractedTests
};

// Mock agent that writes partial output then hangs
const mockScript = `
const prompt = process.argv[process.argv.length - 1];
if (prompt.includes('<test-file-contents>')) {
console.log(JSON.stringify(${JSON.stringify(extractionResult)}));
} else if (prompt.includes('CONTEXT (Prompt Under Test)')) {
process.stdout.write('Partial agent thoughts before timeout');
setTimeout(() => {}, 60000);
}
`;

const error = await Try(runAITests, {
filePath: testFile,
runs: 1,
threshold: 50,
concurrency: 1,
timeout: 2000,
projectRoot: testDir,
agentConfig: {
command: 'node',
args: ['-e', mockScript]
}
});

assert({
given: 'all runs time out but produce partial output',
should: 'include partialResults with the partial stdout',
actual: error?.cause?.partialResults?.responses?.[0]?.includes('Partial agent thoughts before timeout'),
expected: true
});

assert({
given: 'timed out run',
should: 'include timeout marker in the response',
actual: error?.cause?.partialResults?.responses?.[0]?.includes('[RITEWAY TIMEOUT]'),
expected: true
});
} finally {
rmSync(testDir, { recursive: true, force: true });
}
});

test('throws when test file does not exist', async () => {
const error = await Try(runAITests, {
filePath: '/nonexistent/path/to/test.sudo',
Expand Down
Loading
Loading