tests:response_format

This commit is contained in:
lovebird 2025-04-02 00:34:47 +02:00
parent 2eac7ecda1
commit 20cb6861af
17 changed files with 1865 additions and 4165 deletions

View File

@ -35,6 +35,7 @@ export const runCompletion = async (client, params, options) => {
logger.info('Dry run - skipping API call');
return false;
}
// await client.beta.chat.completions.parse
const completion = await client.chat.completions.create({
model: options.model,
messages: params.messages,
@ -44,4 +45,4 @@ export const runCompletion = async (client, params, options) => {
result = await onCompletion(result, options);
return result;
};
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoicnVuLWNvbXBsZXRpb24uanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi9zcmMvY29tbWFuZHMvcnVuLWNvbXBsZXRpb24udHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQ0EsT0FBTyxFQUFFLE1BQU0sRUFBRSxNQUFNLFFBQVEsQ0FBQTtBQUMvQixPQUFPLEVBQUUsY0FBYyxFQUFFLE1BQU0saUJBQWlCLENBQUE7QUFDaEQsT0FBTyxLQUFLLElBQUksTUFBTSxXQUFXLENBQUE7QUFDakMsT0FBTyxFQUFFLElBQUksSUFBSSxLQUFLLEVBQUUsTUFBTSxvQkFBb0IsQ0FBQTtBQUNsRCxPQUFPLEVBQUUsT0FBTyxFQUFFLE1BQU0sbUJBQW1CLENBQUE7QUFJM0MsT0FBTyxFQUFFLE1BQU0sRUFBRSxNQUFNLGFBQWEsQ0FBQTtBQUNwQyxPQUFPLEVBQUUsWUFBWSxFQUFFLE1BQU0sb0JBQW9CLENBQUE7QUFDakQsT0FBTyxFQUFFLFlBQVksRUFBVSxNQUFNLGVBQWUsQ0FBQTtBQUNwRCxPQUFPLEVBQUUsU0FBUyxFQUFFLE1BQU0saUJBQWlCLENBQUE7QUFHM0MsTUFBTSxDQUFDLE1BQU0sWUFBWSxHQUFHLEtBQUssRUFBRSxTQUFjLEVBQUUsRUFBRSxPQUFrQixFQUFFLEVBQUU7SUFDekUsTUFBTSxHQUFHLFlBQVksQ0FBQyxNQUFNLEVBQUUsT0FBTyxDQUFDLE9BQW1CLElBQUksRUFBRSxDQUFDLENBQUE7SUFDaEUsTUFBTSxJQUFJLEdBQUcsU0FBUyxDQUFDLE9BQU8sQ0FBQyxDQUFBO0lBQy9CLElBQUksT0FBTyxDQUFDLEdBQUcsRUFBRSxDQUFDO1FBQ2hCLE1BQU0sT0FBTyxHQUFHLElBQUksQ0FBQyxPQUFPLENBQUMsT0FBTyxDQUFDLE9BQU8sQ0FBQyxHQUFHLEVBQUUsS0FBSyxFQUFFO1lBQ3ZELEdBQUcsSUFBSTtZQUNQLEtBQUssRUFBRSxJQUFJLENBQUMsS0FBSyxDQUFDLE9BQU8sQ0FBQyxLQUFLLENBQUMsQ0FBQyxJQUFJO1lBQ3JDLE1BQU0sRUFBRSxPQUFPLENBQUMsTUFBTTtTQUN2QixDQUFDLENBQUMsQ0FBQTtRQUNILEtBQUssQ0FBQyxPQUFPLEVBQUUsTUFBTSxDQUFDLENBQUE7UUFDdEIsTUFBTSxDQUFDLEtBQUssQ0FBQyw4QkFBOEIsT0FBTyxNQUFNLE9BQU8sQ0FBQyxHQUFHLEVBQUUsQ0FBQyxDQUFBO0lBQ3hFLENBQUM7U0FBTSxDQUFDO1FBQ04sTUFBTSxDQUFDLEdBQUcsQ0FBQyxjQUFjLENBQUM7WUFDeEIsS0FBSyxFQUFFLEtBQUs7U0FDYixDQUFDLENBQUMsQ0FBQTtRQUNILE1BQU0sT0FBTyxHQUFXLE1BQU0sQ0FBQyxNQUFNLENBQVcsQ0FBQztRQUNqRCxPQUFPLENBQUMsTUFBTSxDQUFDLEtBQUssQ0FBQyxPQUFPLENBQUMsQ0FBQTtJQUMvQixDQUFDO0lBQ0QsWUFBWSxDQUFDLE9BQU8sQ0FBQyxDQUFBO0lBQ3JCLGtCQUFrQjtJQUNsQixPQUFPLE1BQU0sQ0FBQTtBQUNmLENBQUMsQ0FBQTtBQUVELE1BQU0sQ0FBQyxNQUFNLGFBQWEsR0FBRyxLQUFLLEVBQUUsTUFBYyxFQUFFLE1BQVcsRUFBRSxPQUFrQixFQUFFLEVBQUU7SUFDckYsSUFBSSxPQUFPLENBQUMsR0FBRyxFQUFFLENBQUM7UU
FDaEIsTUFBTSxDQUFDLElBQUksQ0FBQyw2QkFBNkIsQ0FBQyxDQUFBO1FBQzFDLE9BQU8sS0FBSyxDQUFBO0lBQ2QsQ0FBQztJQUNELE1BQU0sVUFBVSxHQUFHLE1BQU0sTUFBTSxDQUFDLElBQUksQ0FBQyxXQUFXLENBQUMsTUFBTSxDQUFDO1FBQ3RELEtBQUssRUFBRSxPQUFPLENBQUMsS0FBSztRQUNwQixRQUFRLEVBQUUsTUFBTSxDQUFDLFFBQVE7UUFDekIsZUFBZSxFQUFFLE9BQU8sQ0FBQyxNQUFhO0tBQ3ZDLENBQUMsQ0FBQTtJQUNGLElBQUksTUFBTSxHQUFHLFVBQVUsQ0FBQyxPQUFPLENBQUMsQ0FBQyxDQUFDLENBQUMsT0FBTyxDQUFDLE9BQU8sQ0FBQTtJQUNsRCxNQUFNLEdBQUcsTUFBTSxZQUFZLENBQUMsTUFBTSxFQUFFLE9BQU8sQ0FBQyxDQUFBO0lBQzVDLE9BQU8sTUFBTSxDQUFBO0FBQ2YsQ0FBQyxDQUFBIn0=
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoicnVuLWNvbXBsZXRpb24uanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi9zcmMvY29tbWFuZHMvcnVuLWNvbXBsZXRpb24udHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQ0EsT0FBTyxFQUFFLE1BQU0sRUFBRSxNQUFNLFFBQVEsQ0FBQTtBQUMvQixPQUFPLEVBQUUsY0FBYyxFQUFFLE1BQU0saUJBQWlCLENBQUE7QUFDaEQsT0FBTyxLQUFLLElBQUksTUFBTSxXQUFXLENBQUE7QUFDakMsT0FBTyxFQUFFLElBQUksSUFBSSxLQUFLLEVBQUUsTUFBTSxvQkFBb0IsQ0FBQTtBQUNsRCxPQUFPLEVBQUUsT0FBTyxFQUFFLE1BQU0sbUJBQW1CLENBQUE7QUFJM0MsT0FBTyxFQUFFLE1BQU0sRUFBRSxNQUFNLGFBQWEsQ0FBQTtBQUNwQyxPQUFPLEVBQUUsWUFBWSxFQUFFLE1BQU0sb0JBQW9CLENBQUE7QUFDakQsT0FBTyxFQUFFLFlBQVksRUFBVSxNQUFNLGVBQWUsQ0FBQTtBQUNwRCxPQUFPLEVBQUUsU0FBUyxFQUFFLE1BQU0saUJBQWlCLENBQUE7QUFHM0MsTUFBTSxDQUFDLE1BQU0sWUFBWSxHQUFHLEtBQUssRUFBRSxTQUFjLEVBQUUsRUFBRSxPQUFrQixFQUFFLEVBQUU7SUFDekUsTUFBTSxHQUFHLFlBQVksQ0FBQyxNQUFNLEVBQUUsT0FBTyxDQUFDLE9BQW1CLElBQUksRUFBRSxDQUFDLENBQUE7SUFDaEUsTUFBTSxJQUFJLEdBQUcsU0FBUyxDQUFDLE9BQU8sQ0FBQyxDQUFBO0lBQy9CLElBQUksT0FBTyxDQUFDLEdBQUcsRUFBRSxDQUFDO1FBQ2hCLE1BQU0sT0FBTyxHQUFHLElBQUksQ0FBQyxPQUFPLENBQUMsT0FBTyxDQUFDLE9BQU8sQ0FBQyxHQUFHLEVBQUUsS0FBSyxFQUFFO1lBQ3ZELEdBQUcsSUFBSTtZQUNQLEtBQUssRUFBRSxJQUFJLENBQUMsS0FBSyxDQUFDLE9BQU8sQ0FBQyxLQUFLLENBQUMsQ0FBQyxJQUFJO1lBQ3JDLE1BQU0sRUFBRSxPQUFPLENBQUMsTUFBTTtTQUN2QixDQUFDLENBQUMsQ0FBQTtRQUNILEtBQUssQ0FBQyxPQUFPLEVBQUUsTUFBTSxDQUFDLENBQUE7UUFDdEIsTUFBTSxDQUFDLEtBQUssQ0FBQyw4QkFBOEIsT0FBTyxNQUFNLE9BQU8sQ0FBQyxHQUFHLEVBQUUsQ0FBQyxDQUFBO0lBQ3hFLENBQUM7U0FBTSxDQUFDO1FBQ04sTUFBTSxDQUFDLEdBQUcsQ0FBQyxjQUFjLENBQUM7WUFDeEIsS0FBSyxFQUFFLEtBQUs7U0FDYixDQUFDLENBQUMsQ0FBQTtRQUNILE1BQU0sT0FBTyxHQUFXLE1BQU0sQ0FBQyxNQUFNLENBQVcsQ0FBQztRQUNqRCxPQUFPLENBQUMsTUFBTSxDQUFDLEtBQUssQ0FBQyxPQUFPLENBQUMsQ0FBQTtJQUMvQixDQUFDO0lBQ0QsWUFBWSxDQUFDLE9BQU8sQ0FBQyxDQUFBO0lBQ3JCLGtCQUFrQjtJQUNsQixPQUFPLE1BQU0sQ0FBQTtBQUNmLENBQUMsQ0FBQTtBQUVELE1BQU0sQ0FBQyxNQUFNLGFBQWEsR0FBRyxLQUFLLEVBQUUsTUFBYyxFQUFFLE1BQVcsRUFBRSxPQUFrQixFQUFFLEVBQUU7SUFDckYsSUFBSSxPQUFPLENBQUMsR0FBRyxFQUFFLENBQUM7UU
FDaEIsTUFBTSxDQUFDLElBQUksQ0FBQyw2QkFBNkIsQ0FBQyxDQUFBO1FBQzFDLE9BQU8sS0FBSyxDQUFBO0lBQ2QsQ0FBQztJQUNELDJDQUEyQztJQUMzQyxNQUFNLFVBQVUsR0FBRyxNQUFNLE1BQU0sQ0FBQyxJQUFJLENBQUMsV0FBVyxDQUFDLE1BQU0sQ0FBQztRQUN0RCxLQUFLLEVBQUUsT0FBTyxDQUFDLEtBQUs7UUFDcEIsUUFBUSxFQUFFLE1BQU0sQ0FBQyxRQUFRO1FBQ3pCLGVBQWUsRUFBRSxPQUFPLENBQUMsTUFBYTtLQUN2QyxDQUFDLENBQUE7SUFDRixJQUFJLE1BQU0sR0FBRyxVQUFVLENBQUMsT0FBTyxDQUFDLENBQUMsQ0FBQyxDQUFDLE9BQU8sQ0FBQyxPQUFPLENBQUE7SUFDbEQsTUFBTSxHQUFHLE1BQU0sWUFBWSxDQUFDLE1BQU0sRUFBRSxPQUFPLENBQUMsQ0FBQTtJQUM1QyxPQUFPLE1BQU0sQ0FBQTtBQUNmLENBQUMsQ0FBQSJ9

File diff suppressed because one or more lines are too long

View File

@ -1,9 +1,9 @@
{
"model": "anthropic/claude-2.0",
"model": "gpt-4o",
"messages": [
{
"role": "user",
"content": "translate \"no\" to French. Return only the translated word, no explanation."
"content": "Generate a random number."
},
{
"role": "user",

View File

@ -19,6 +19,7 @@
"test": "vitest run",
"test:basic": "vitest run tests/unit/basic.test.ts",
"test:math": "vitest run tests/unit/math.test.ts",
"test:format": "vitest run tests/unit/format.test.ts",
"test:language": "vitest run tests/unit/language.test.ts",
"test2:watch": "vitest",
"test2:coverage": "vitest run --coverage",

View File

@ -56,8 +56,7 @@ export const models = () => {
const openRouterPath = path.resolve(OPENROUTER_CACHE_PATH)
if (!exists(openRouterPath)) {
fetchOpenRouterModels()
}
if (exists(openRouterPath)) {
}else{
const modelData: OpenRouterCachedModels = read(openRouterPath, 'json') as OpenRouterCachedModels
models.push(chalk.magenta.bold('\n OpenRouter models:\n'))
models.push(...listOpenRouterModelsAsStrings(modelData.models))
@ -85,8 +84,7 @@ export const all = () => {
const openRouterPath = path.resolve(OPENROUTER_CACHE_PATH)
if (!exists(openRouterPath)) {
fetchOpenRouterModels()
}
if (exists(openRouterPath)) {
}else{
const modelData: OpenRouterCachedModels = read(openRouterPath, 'json') as OpenRouterCachedModels
models = models.concat(modelData.models)
}

View File

@ -2,171 +2,56 @@
## Failed Tests
### basic_arithmetic - deepseek/deepseek-chat:free
- Prompt: `return the result of 2+2, dont comment`
- Expected: `undefined`
- Actual: `4`
- Reason: undefined
- Timestamp: 4/1/2025, 12:26:30 PM
### basic_arithmetic - google/gemini-2.0-flash-exp:free
- Prompt: `return the result of 2+2, dont comment`
- Expected: `undefined`
- Actual: `4
`
- Reason: undefined
- Timestamp: 4/1/2025, 12:26:31 PM
### basic_arithmetic - gpt-4
- Prompt: `return the result of 2+2, dont comment`
- Expected: `undefined`
- Actual: `4`
- Reason: undefined
- Timestamp: 4/1/2025, 12:26:32 PM
### json_structure - deepseek/deepseek-chat:free
- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.`
- Expected: `undefined`
- Actual: `{"name":"test","value":42}`
- Reason: undefined
- Timestamp: 4/1/2025, 12:26:33 PM
### json_structure - gpt-4
- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.`
- Expected: `undefined`
- Actual: `{"name": "test", "value": 42}`
- Reason: undefined
- Timestamp: 4/1/2025, 12:26:36 PM
### json_structure - google/gemini-2.0-flash-exp:free
- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.`
- Expected: `undefined`
- Actual: `{
"name": "test",
"value": 42
}`
- Reason: undefined
- Timestamp: 4/1/2025, 12:26:35 PM
### hello - deepseek/deepseek-chat:free
- Prompt: `say "hello"`
- Expected: `hello`
- Actual: ``
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:36:37 PM
### hello - google/gemini-2.0-flash-exp:free
- Prompt: `say "hello"`
- Expected: `hello`
- Actual: ``
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:36:37 PM
### hello - gpt-4
- Prompt: `say "hello"`
- Expected: `hello`
- Actual: ``
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:36:42 PM
### goodbye - deepseek/deepseek-chat:free
- Prompt: `say "goodbye"`
- Expected: `goodbye`
- Actual: ``
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:36:42 PM
### goodbye - google/gemini-2.0-flash-exp:free
- Prompt: `say "goodbye"`
- Expected: `goodbye`
- Actual: ``
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:36:43 PM
### goodbye - gpt-4
- Prompt: `say "goodbye"`
- Expected: `goodbye`
- Actual: ``
- Reason: expected 'goodbye.' to deeply equal 'goodbye'
- Timestamp: 4/1/2025, 1:36:44 PM
### yes - deepseek/deepseek-chat:free
- Prompt: `say "yes"`
- Expected: `yes`
- Actual: ``
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:36:45 PM
### yes - google/gemini-2.0-flash-exp:free
- Prompt: `say "yes"`
- Expected: `yes`
- Actual: ``
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:36:45 PM
### yes - gpt-4
- Prompt: `say "yes"`
- Expected: `yes`
- Actual: ``
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:36:46 PM
*No failed tests*
## Passed Tests
### addition - deepseek/deepseek-chat:free
### addition - anthropic/claude-3.5-sonnet
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Timestamp: 4/1/2025, 12:59:06 PM
- Duration: 1551ms
- Timestamp: 4/2/2025, 12:17:39 AM
### addition - google/gemini-2.0-flash-exp:free
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8
`
- Timestamp: 4/1/2025, 12:59:08 PM
### addition - gpt-4
### addition - qwen/qwq-32b
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Timestamp: 4/1/2025, 1:39:04 PM
- Duration: 3621ms
- Timestamp: 4/2/2025, 12:17:42 AM
### multiplication - deepseek/deepseek-chat:free
### multiplication - anthropic/claude-3.5-sonnet
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Timestamp: 4/1/2025, 12:59:13 PM
- Duration: 873ms
- Timestamp: 4/2/2025, 12:17:43 AM
### multiplication - google/gemini-2.0-flash-exp:free
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24
`
- Timestamp: 4/1/2025, 12:59:15 PM
### multiplication - gpt-4
### multiplication - qwen/qwq-32b
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Timestamp: 4/1/2025, 1:39:06 PM
- Duration: 3472ms
- Timestamp: 4/2/2025, 12:17:47 AM
### division - deepseek/deepseek-chat:free
### division - anthropic/claude-3.5-sonnet
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Timestamp: 4/1/2025, 12:59:18 PM
- Duration: 1183ms
- Timestamp: 4/2/2025, 12:17:48 AM
### division - google/gemini-2.0-flash-exp:free
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5
`
- Timestamp: 4/1/2025, 12:56:09 PM
### division - gpt-4
### division - qwen/qwq-32b
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Timestamp: 4/1/2025, 1:39:08 PM
- Duration: 4841ms
- Timestamp: 4/2/2025, 12:17:53 AM
## Summary
- Total Tests: 6
- Passed: 6
- Failed: 0
- Success Rate: 100.00%

File diff suppressed because it is too large Load Diff

View File

@ -4,153 +4,121 @@ import * as path from 'node:path'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
import {
models,
TEST_BASE_PATH,
TEST_LOGS_PATH,
TEST_PREFERENCES_PATH,
TEST_TIMEOUT,
TestResult,
formatError,
isEmptyResponse
} from './commons'
const TEST_LOG_PATH = path.resolve(__dirname, './basic.json')
describe('Basic Operations', () => {
let testResults: TestResult[] = []
// Load existing results if any
if (exists(TEST_LOG_PATH)) {
const data = read(TEST_LOG_PATH, 'json')
testResults = Array.isArray(data) ? data : []
/**
 * Runs a single completion prompt against one model, asserts the answer and
 * records the outcome.
 *
 * The call to `run` is raced against a timer of TEST_TIMEOUT milliseconds.
 * Whatever happens (pass, assertion failure, empty response, timeout, API
 * error) a TestResult is appended to `testResults` and the whole array is
 * written to TEST_LOG_PATH. Failures are rethrown after being recorded so the
 * surrounding test still fails.
 *
 * @param prompt    Prompt text sent to the model.
 * @param expected  Expected answer; compared against the trimmed, lower-cased
 *                  first entry of the response.
 * @param testName  Logical test name stored in the result record.
 * @param modelName Model identifier passed to `run`.
 * @throws Rethrows any error raised by `run`, the timeout, the empty-response
 *         check or the assertion.
 */
const runBasicTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
  let model = 'unknown'
  let router = 'unknown'
  const startTime = Date.now()
  let testResult: TestResult | undefined
  // Handle of the timeout timer, kept so it can be cancelled once the race settles.
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined

  try {
    const result = await Promise.race([
      run({
        prompt,
        mode: 'completion',
        model: modelName,
        path: TEST_BASE_PATH,
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        // Capture the model/router that were actually resolved for the report.
        onRun: async (options) => {
          model = options.model || 'unknown'
          router = options.router || 'unknown'
          return options
        }
      }),
      new Promise((_, reject) => {
        timeoutHandle = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
      })
    ]) as string[]

    if (isEmptyResponse(result)) {
      throw new Error('Model returned empty response')
    }

    const actual = result?.[0]?.trim()?.toLowerCase() || ''
    const passed = actual === expected
    // A mismatch throws here, so the failure record is built in the catch block.
    expect(actual).toEqual(expected)

    testResult = {
      test: testName,
      prompt,
      result: result || [],
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed,
      duration: Date.now() - startTime,
      reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
    }
  } catch (e) {
    const error = formatError(e)
    testResult = {
      test: testName,
      prompt,
      result: [],
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed: false,
      duration: Date.now() - startTime,
      error,
      reason: error?.message || 'Unknown error occurred'
    }
    // Rethrow so the test runner still reports the failure.
    throw e
  } finally {
    // Cancel the pending timeout; otherwise it stays scheduled after the race
    // has settled and keeps the process alive until TEST_TIMEOUT elapses.
    if (timeoutHandle !== undefined) {
      clearTimeout(timeoutHandle)
    }
    if (testResult) {
      testResults.push(testResult)
      write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
    }
  }
}
it.each(models)('should add two numbers with model %s', async (modelName) => {
const prompt = 'add 5 and 3. Return only the number, no explanation.'
const expected = '8'
let model = 'unknown'
let router = 'unknown'
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}) as string[]
const actual = result?.[0]?.trim() || ''
if (!actual) {
console.log(`Skipping test for model ${modelName} - no result returned`)
return
}
const passed = actual === expected
expect(actual).toEqual(expected)
// Add test result to array
testResults.push({
test: 'addition',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
})
await runBasicTest(
'add 5 and 3. Return only the number, no explanation.',
'8',
'addition',
modelName
)
}, { timeout: 10000 })
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
const prompt = 'multiply 8 and 3. Return only the number, no explanation.'
const expected = '24'
let model = 'unknown'
let router = 'unknown'
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}) as string[]
const actual = result?.[0]?.trim() || ''
if (!actual) {
console.log(`Skipping test for model ${modelName} - no result returned`)
return
}
const passed = actual === expected
expect(actual).toEqual(expected)
// Add test result to array
testResults.push({
test: 'multiplication',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
})
await runBasicTest(
'multiply 8 and 3. Return only the number, no explanation.',
'24',
'multiplication',
modelName
)
}, { timeout: 10000 })
it.each(models)('should divide two numbers with model %s', async (modelName) => {
const prompt = 'divide 15 by 3. Return only the number, no explanation.'
const expected = '5'
let model = 'unknown'
let router = 'unknown'
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}) as string[]
const actual = result?.[0]?.trim() || ''
if (!actual) {
console.log(`Skipping test for model ${modelName} - no result returned`)
return
}
const passed = actual === expected
expect(actual).toEqual(expected)
// Add test result to array
testResults.push({
test: 'division',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
})
await runBasicTest(
'divide 15 by 3. Return only the number, no explanation.',
'5',
'division',
modelName
)
}, { timeout: 10000 })
it('should generate markdown report', () => {
// Group results by test and model
@ -173,32 +141,64 @@ describe('Basic Operations', () => {
// First list failed tests
report += '## Failed Tests\n\n'
let hasFailures = false
for (const [testName, modelResults] of latestResults) {
for (const [model, result] of modelResults) {
if (!result.passed) {
hasFailures = true
report += `### ${testName} - ${model}\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms\n`
if (result.error) {
report += `- Error Type: ${result.error.type}\n`
report += `- Error Code: ${result.error.code}\n`
report += `- Error Message: ${result.error.message}\n`
if (result.error.details?.message) {
report += `- Error Details: ${result.error.details.message}\n`
}
}
report += `- Reason: ${result.reason}\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}
}
if (!hasFailures) {
report += '*No failed tests*\n\n'
}
// Then list passed tests
report += '## Passed Tests\n\n'
let hasPassed = false
for (const [testName, modelResults] of latestResults) {
for (const [model, result] of modelResults) {
if (result.passed) {
hasPassed = true
report += `### ${testName} - ${model}\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}
}
if (!hasPassed) {
report += '*No passed tests*\n\n'
}
// Add summary section
report += '## Summary\n\n'
const totalTests = testResults.length
const passedTests = testResults.filter(r => r.passed).length
const failedTests = totalTests - passedTests
report += `- Total Tests: ${totalTests}\n`
report += `- Passed: ${passedTests}\n`
report += `- Failed: ${failedTests}\n`
report += `- Success Rate: ${((passedTests / totalTests) * 100).toFixed(2)}%\n\n`
// Write report to file
const reportPath = path.resolve(__dirname, './basic-report.md')

View File

@ -3,7 +3,8 @@ import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL, E_OPENROUTER_MODEL } from '../
export const models = [
//E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_2_0
E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B
]
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')

View File

@ -2,546 +2,224 @@
## Failed Tests
### basic_structure - deepseek/deepseek-chat:free
- Prompt: `return a greeting "hello" with count 42`
- Expected: `{"greeting":"hello","count":42}`
- Actual: `""`
- Duration: 885ms
- Error Type: Error
### json-schema-file-format - anthropic/claude-3.5-sonnet
- Prompt: `Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.`
- Expected: `{"name":"John Doe","age":30,"tags":["developer","javascript"]}`
- Actual: ``
- Duration: 1690ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:36 PM
- Error Message: expected '{\n "name": "John Doe",\n "age": 30…' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
- Error Details: expected '{\n "name": "John Doe",\n "age": 30…' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
- Reason: expected '{\n "name": "John Doe",\n "age": 30…' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
- Timestamp: 4/2/2025, 12:23:43 AM
### basic_structure - google/gemini-2.0-flash-exp:free
- Prompt: `return a greeting "hello" with count 42`
- Expected: `{"greeting":"hello","count":42}`
- Actual: `""`
- Duration: 757ms
- Error Type: Error
### json-schema-file-format - qwen/qwq-32b
- Prompt: `Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.`
- Expected: `{"name":"John Doe","age":30,"tags":["developer","javascript"]}`
- Actual: ``
- Duration: 3426ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:36 PM
- Error Message: expected '"John Doe",\n "age": 30,\n "tags": …' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
- Error Details: expected '"John Doe",\n "age": 30,\n "tags": …' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
- Reason: expected '"John Doe",\n "age": 30,\n "tags": …' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
- Timestamp: 4/2/2025, 12:23:47 AM
### basic_structure - gpt-4
- Prompt: `return a greeting "hello" with count 42`
- Expected: `{"greeting":"hello","count":42}`
- Actual: `""`
- Duration: 1043ms
- Error Type: Error
### json-schema-object-format - anthropic/claude-3.5-sonnet
- Prompt: `Create a user profile with name Jane Smith, age 25, and tags ["designer", "ui"]. Return only the JSON object, no explanation.`
- Expected: `{"name":"Jane Smith","age":25,"tags":["designer","ui"]}`
- Actual: ``
- Duration: 1918ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:37 PM
- Error Message: expected '{\n "name": "Jane Smith",\n "age": …' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
- Error Details: expected '{\n "name": "Jane Smith",\n "age": …' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
- Reason: expected '{\n "name": "Jane Smith",\n "age": …' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
- Timestamp: 4/2/2025, 12:23:49 AM
### basic_structure - anthropic/claude-3.7-sonnet
- Prompt: `return a greeting "hello" with count 42`
- Expected: `{"greeting":"hello","count":42}`
- Actual: `""`
- Duration: 1790ms
- Error Type: Error
### json-schema-object-format - qwen/qwq-32b
- Prompt: `Create a user profile with name Jane Smith, age 25, and tags ["designer", "ui"]. Return only the JSON object, no explanation.`
- Expected: `{"name":"Jane Smith","age":25,"tags":["designer","ui"]}`
- Actual: ``
- Duration: 9789ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: Unexpected token 'h', "hello 42" is not valid JSON
- Reason: Failed to parse or validate response: Unexpected token 'h', "hello 42" is not valid JSON
- Timestamp: 4/1/2025, 1:23:05 PM
- Error Message: expected '{"name": "Jane Smith", "age": 25, "ta…' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
- Error Details: expected '{"name": "Jane Smith", "age": 25, "ta…' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
- Reason: expected '{"name": "Jane Smith", "age": 25, "ta…' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
- Timestamp: 4/2/2025, 12:23:58 AM
### basic_structure - openai/gpt-4
- Prompt: `Return a JSON object with a greeting "hello" and count 42. The response must be valid JSON with exactly these fields: { "greeting": string, "count": number }`
- Expected: `{"greeting":"hello","count":42}`
- Actual: `""`
- Duration: 1258ms
- Error Type: Error
### json-schema-object-format - gpt-4o
- Prompt: `Create a user profile with the following details:
- Name: Jane Smith
- Age: 25
- Email: jane.smith@company.com
- Tags: ["developer", "designer"]
- Address: 123 Main St, New York, US, 10001
- Preferences: light theme, notifications enabled, English language
Return only the JSON object, no explanation.`
- Expected: `{"name":"Jane Smith","age":25,"email":"jane.smith@company.com","tags":["developer","designer"],"address":{"street":"123 Main St","city":"New York","country":"US","zipCode":"10001"},"preferences":{"theme":"light","notifications":true,"language":"en"}}`
- Actual: ``
- Duration: 2618ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Invalid response from API
- Reason: Invalid response from API
- Timestamp: 4/1/2025, 1:32:43 PM
- Error Message: expected '{"name":"Jane Smith","age":25,"email"…' to deeply equal '{"name":"Jane Smith","age":25,"email"…'
- Error Details: expected '{"name":"Jane Smith","age":25,"email"…' to deeply equal '{"name":"Jane Smith","age":25,"email"…'
- Reason: expected '{"name":"Jane Smith","age":25,"email"…' to deeply equal '{"name":"Jane Smith","age":25,"email"…'
- Timestamp: 4/2/2025, 12:33:08 AM
### nested_structure - deepseek/deepseek-chat:free
- Prompt: `return user John age 30 with dark theme and notifications enabled`
- Expected: `{"user":{"name":"John","age":30},"settings":{"theme":"dark","notifications":true}}`
- Actual: `""`
- Duration: 655ms
- Error Type: Error
### zod-string-format - anthropic/claude-3.5-sonnet
- Prompt: `Generate a valid email address for a business domain. Return only the email, no explanation.`
- Expected: `john.doe@company.com`
- Actual: ``
- Duration: 1347ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:38 PM
- Error Message: expected 'sales@companyplus.com' to deeply equal 'john.doe@company.com'
- Error Details: expected 'sales@companyplus.com' to deeply equal 'john.doe@company.com'
- Reason: expected 'sales@companyplus.com' to deeply equal 'john.doe@company.com'
- Timestamp: 4/2/2025, 12:24:00 AM
### nested_structure - google/gemini-2.0-flash-exp:free
- Prompt: `return user John age 30 with dark theme and notifications enabled`
- Expected: `{"user":{"name":"John","age":30},"settings":{"theme":"dark","notifications":true}}`
- Actual: `""`
- Duration: 790ms
- Error Type: Error
### zod-string-format - qwen/qwq-32b
- Prompt: `Generate a valid email address for a business domain. Return only the email, no explanation.`
- Expected: `john.doe@company.com`
- Actual: ``
- Duration: 13704ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:39 PM
- Error Message: expected 'info@techstart.com' to deeply equal 'john.doe@company.com'
- Error Details: expected 'info@techstart.com' to deeply equal 'john.doe@company.com'
- Reason: expected 'info@techstart.com' to deeply equal 'john.doe@company.com'
- Timestamp: 4/2/2025, 12:24:13 AM
### nested_structure - gpt-4
- Prompt: `return user John age 30 with dark theme and notifications enabled`
- Expected: `{"user":{"name":"John","age":30},"settings":{"theme":"dark","notifications":true}}`
- Actual: `""`
- Duration: 717ms
- Error Type: Error
### zod-string-format - gpt-4o
- Prompt: `Generate a valid email address for a business domain. Return only the email, no explanation.`
- Expected: `john.doe@company.com`
- Actual: ``
- Duration: 1794ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:40 PM
- Error Message: expected 'contact@businessdomain.com' to deeply equal 'john.doe@company.com'
- Error Details: expected 'contact@businessdomain.com' to deeply equal 'john.doe@company.com'
- Reason: expected 'contact@businessdomain.com' to deeply equal 'john.doe@company.com'
- Timestamp: 4/2/2025, 12:32:20 AM
### nested_structure - anthropic/claude-3.7-sonnet
- Prompt: `return user John age 30 with dark theme and notifications enabled`
- Expected: `{"user":{"name":"John","age":30},"settings":{"theme":"dark","notifications":true}}`
- Actual: `""`
- Duration: 1189ms
- Error Type: Error
### zod-number-format - anthropic/claude-3.5-sonnet
- Prompt: `Generate a random age between 18 and 65. Return only the number, no explanation.`
- Expected: `25`
- Actual: ``
- Duration: 1376ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: Unexpected token '#', "# John's U"... is not valid JSON
- Reason: Failed to parse or validate response: Unexpected token '#', "# John's U"... is not valid JSON
- Timestamp: 4/1/2025, 1:23:06 PM
- Error Message: expected '42' to deeply equal '25'
- Error Details: expected '42' to deeply equal '25'
- Reason: expected '42' to deeply equal '25'
- Timestamp: 4/2/2025, 12:24:11 AM
### nested_structure - openai/gpt-4
- Prompt: `Return a JSON object with user John age 30, dark theme and notifications enabled. The response must be valid JSON with this structure: { "user": { "name": string, "age": number }, "settings": { "theme": string, "notifications": boolean } }`
- Expected: `{"user":{"name":"John","age":30},"settings":{"theme":"dark","notifications":true}}`
- Actual: `""`
- Duration: 716ms
- Error Type: Error
### zod-number-format - gpt-4o
- Prompt: `Generate a random age between 18 and 65. Return only the number, no explanation.`
- Expected: `25`
- Actual: ``
- Duration: 2399ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Invalid response from API
- Reason: Invalid response from API
- Timestamp: 4/1/2025, 1:32:44 PM
- Error Message: expected '39' to deeply equal '25'
- Error Details: expected '39' to deeply equal '25'
- Reason: expected '39' to deeply equal '25'
- Timestamp: 4/2/2025, 12:32:23 AM
### array_structure - deepseek/deepseek-chat:free
- Prompt: `return a list of 2 items with ids 1 and 2, names "first" and "second"`
- Expected: `{"items":[{"id":1,"name":"first"},{"id":2,"name":"second"}]}`
- Actual: `""`
- Duration: 617ms
- Error Type: Error
### zod-array-format - anthropic/claude-3.5-sonnet
- Prompt: `Generate a list of 3 programming languages. Return only the array, no explanation.`
- Expected: `["JavaScript","Python","Java"]`
- Actual: ``
- Duration: 1009ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:40 PM
- Error Message: expected '["Python", "Java", "JavaScript"]' to deeply equal '["JavaScript","Python","Java"]'
- Error Details: expected '["Python", "Java", "JavaScript"]' to deeply equal '["JavaScript","Python","Java"]'
- Reason: expected '["Python", "Java", "JavaScript"]' to deeply equal '["JavaScript","Python","Java"]'
- Timestamp: 4/2/2025, 12:24:22 AM
### array_structure - google/gemini-2.0-flash-exp:free
- Prompt: `return a list of 2 items with ids 1 and 2, names "first" and "second"`
- Expected: `{"items":[{"id":1,"name":"first"},{"id":2,"name":"second"}]}`
- Actual: `""`
- Duration: 756ms
- Error Type: Error
### zod-array-format - qwen/qwq-32b
- Prompt: `Generate a list of 3 programming languages. Return only the array, no explanation.`
- Expected: `["JavaScript","Python","Java"]`
- Actual: ``
- Duration: 4147ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:41 PM
- Error Message: expected '["Python", "JavaScript", "Java"]' to deeply equal '["JavaScript","Python","Java"]'
- Error Details: expected '["Python", "JavaScript", "Java"]' to deeply equal '["JavaScript","Python","Java"]'
- Reason: expected '["Python", "JavaScript", "Java"]' to deeply equal '["JavaScript","Python","Java"]'
- Timestamp: 4/2/2025, 12:24:26 AM
### array_structure - gpt-4
- Prompt: `return a list of 2 items with ids 1 and 2, names "first" and "second"`
- Expected: `{"items":[{"id":1,"name":"first"},{"id":2,"name":"second"}]}`
- Actual: `""`
### zod-array-format - gpt-4o
- Prompt: `Generate a list of 3 programming languages. Return only the array, no explanation.`
- Expected: `["JavaScript","Python","Java"]`
- Actual: ``
- Duration: 693ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: expected '["Python","JavaScript","Java"]' to deeply equal '["JavaScript","Python","Java"]'
- Error Details: expected '["Python","JavaScript","Java"]' to deeply equal '["JavaScript","Python","Java"]'
- Reason: expected '["Python","JavaScript","Java"]' to deeply equal '["JavaScript","Python","Java"]'
- Timestamp: 4/2/2025, 12:32:23 AM
### invalid-format - anthropic/claude-3.5-sonnet
- Prompt: `Generate a random number.`
- Expected: `Invalid format option`
- Actual: ``
- Duration: 1026ms
- Error Type: Error
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:42 PM
- Error Message: expected '73' to deeply equal 'Invalid format option'
- Error Details: expected '73' to deeply equal 'Invalid format option'
- Reason: expected '73' to deeply equal 'Invalid format option'
- Timestamp: 4/2/2025, 12:24:27 AM
### array_structure - anthropic/claude-3.7-sonnet
- Prompt: `return a list of 2 items with ids 1 and 2, names "first" and "second"`
- Expected: `{"items":[{"id":1,"name":"first"},{"id":2,"name":"second"}]}`
- Actual: `""`
- Duration: 1190ms
- Error Type: Error
### invalid-format - qwen/qwq-32b
- Prompt: `Generate a random number.`
- Expected: `Invalid format option`
- Actual: ``
- Duration: 7614ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:23:08 PM
- Error Message: expected '72' to deeply equal 'Invalid format option'
- Error Details: expected '72' to deeply equal 'Invalid format option'
- Reason: expected '72' to deeply equal 'Invalid format option'
- Timestamp: 4/2/2025, 12:24:35 AM
### array_structure - openai/gpt-4
- Prompt: `Return a JSON object with a list of 2 items. The response must be valid JSON with this structure: { "items": [{ "id": number, "name": string }] }. The first item should have id 1 and name "first", the second item should have id 2 and name "second".`
- Expected: `{"items":[{"id":1,"name":"first"},{"id":2,"name":"second"}]}`
- Actual: `""`
- Duration: 703ms
- Error Type: Error
### invalid-format - gpt-4o
- Prompt: `Generate a random number.`
- Expected: `Invalid format option`
- Actual: ``
- Duration: 826ms
- Error Type: AssertionError
- Error Code: UNKNOWN
- Error Message: Invalid response from API
- Reason: Invalid response from API
- Timestamp: 4/1/2025, 1:32:44 PM
### enum_structure - deepseek/deepseek-chat:free
- Prompt: `return status success with message "Operation completed"`
- Expected: `{"status":"success","message":"Operation completed"}`
- Actual: `""`
- Duration: 647ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:43 PM
### enum_structure - google/gemini-2.0-flash-exp:free
- Prompt: `return status success with message "Operation completed"`
- Expected: `{"status":"success","message":"Operation completed"}`
- Actual: `""`
- Duration: 813ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:43 PM
### enum_structure - gpt-4
- Prompt: `return status success with message "Operation completed"`
- Expected: `{"status":"success","message":"Operation completed"}`
- Actual: `""`
- Duration: 1138ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:45 PM
### enum_structure - anthropic/claude-3.7-sonnet
- Prompt: `return status success with message "Operation completed"`
- Expected: `{"status":"success","message":"Operation completed"}`
- Actual: `""`
- Duration: 1728ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: Unexpected token '`', "```json
{
"... is not valid JSON
- Reason: Failed to parse or validate response: Unexpected token '`', "```json
{
"... is not valid JSON
- Timestamp: 4/1/2025, 1:23:09 PM
### enum_structure - openai/gpt-4
- Prompt: `Return a JSON object with status "success" and message "Operation completed". The response must be valid JSON with this structure: { "status": "success" | "error" | "pending", "message": string }`
- Expected: `{"status":"success","message":"Operation completed"}`
- Actual: `""`
- Duration: 688ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Invalid response from API
- Reason: Invalid response from API
- Timestamp: 4/1/2025, 1:32:45 PM
### optional_fields - deepseek/deepseek-chat:free
- Prompt: `return name "John" with age 30 and email "john@example.com"`
- Expected: `{"name":"John","age":30,"email":"john@example.com"}`
- Actual: `""`
- Duration: 676ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:45 PM
### optional_fields - google/gemini-2.0-flash-exp:free
- Prompt: `return name "John" with age 30 and email "john@example.com"`
- Expected: `{"name":"John","age":30,"email":"john@example.com"}`
- Actual: `""`
- Duration: 884ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:46 PM
### optional_fields - gpt-4
- Prompt: `return name "John" with age 30 and email "john@example.com"`
- Expected: `{"name":"John","age":30,"email":"john@example.com"}`
- Actual: `""`
- Duration: 669ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Reason: Failed to parse or validate response: [
{
"code": "invalid_type",
"expected": "object",
"received": "null",
"path": [],
"message": "Expected object, received null"
}
]
- Timestamp: 4/1/2025, 1:21:47 PM
### optional_fields - anthropic/claude-3.7-sonnet
- Prompt: `return name "John" with age 30 and email "john@example.com"`
- Expected: `{"name":"John","age":30,"email":"john@example.com"}`
- Actual: `""`
- Duration: 1576ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Failed to parse or validate response: Unexpected token '`', "```json
{
"... is not valid JSON
- Reason: Failed to parse or validate response: Unexpected token '`', "```json
{
"... is not valid JSON
- Timestamp: 4/1/2025, 1:23:11 PM
### optional_fields - openai/gpt-4
- Prompt: `Return a JSON object with name "John", age 30, and email "john@example.com". The response must be valid JSON with this structure: { "name": string, "age"?: number, "email"?: string }`
- Expected: `{"name":"John","age":30,"email":"john@example.com"}`
- Actual: `""`
- Duration: 682ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Invalid response from API
- Reason: Invalid response from API
- Timestamp: 4/1/2025, 1:32:46 PM
- Error Message: expected '786984' to deeply equal 'Invalid format option'
- Error Details: expected '786984' to deeply equal 'Invalid format option'
- Reason: expected '786984' to deeply equal 'Invalid format option'
- Timestamp: 4/2/2025, 12:32:24 AM
## Passed Tests
*No passed tests*
### json-schema-file-format - gpt-4o
- Prompt: `Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.`
- Expected: `{"name":"John Doe","age":30,"tags":["developer","javascript"]}`
- Actual: ````json
{
"name": "John Doe",
"age": 30,
"tags": ["developer", "javascript"]
}
````
- Duration: 995ms
- Timestamp: 4/2/2025, 12:34:28 AM
## Summary
- Total Tests: 38
- Passed: 5
- Failed: 33
- Success Rate: 13.16%

File diff suppressed because it is too large Load Diff

View File

@ -1,295 +1,508 @@
import { describe, it, expect } from 'vitest'
import { run } from '../../src/index'
import * as path from 'node:path'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { z } from 'zod'
import {
models_premium as models,
TEST_BASE_PATH,
TEST_LOGS_PATH,
TEST_PREFERENCES_PATH,
TEST_TIMEOUT,
TestResult,
formatError,
isEmptyResponse,
getRouterForModel,
getApiKeyForRouter
} from './commons'
const TEST_LOG_PATH = path.resolve(__dirname, './format.json')
describe('Structured Output Format', () => {
let testResults: TestResult[] = []
// Load existing results if any
if (exists(TEST_LOG_PATH)) {
const data = read(TEST_LOG_PATH, 'json')
testResults = Array.isArray(data) ? data : []
}
/**
 * Runs a single structured-output test and appends its outcome to the JSON log.
 *
 * The model response (first element of the `run` result) is JSON-parsed when it
 * is a string, validated against `format`, and compared with `expected`.
 * Exactly one log record is written per call, reflecting the real outcome.
 *
 * @param prompt    Prompt sent to the model.
 * @param format    Zod schema forwarded to `run` and used to validate the response.
 * @param expected  Value the validated response must deeply equal.
 * @param testName  Identifier stored in the log record.
 * @param modelName Model passed to `run`.
 * @returns A summary object for the successful run.
 * @throws Re-throws any timeout, parse/validation or assertion error after logging it.
 */
const runFormatTest = async (prompt: string, format: z.ZodType<any>, expected: any, testName: string, modelName: string) => {
  let model = 'unknown'
  let router = 'unknown'
  const startTime = Date.now()
  let error: TestResult['error'] | undefined
  let result: any[] = []
  let passed = false
  let timer: ReturnType<typeof setTimeout> | undefined

  try {
    result = await Promise.race([
      run({
        prompt,
        mode: 'completion',
        model: modelName,
        path: TEST_BASE_PATH,
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        format,
        onRun: async (options) => {
          model = options.model || 'unknown'
          router = options.router || 'unknown'
          return options
        }
      }),
      new Promise((_, reject) => {
        timer = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
      })
    ]) as any[]

    const actual = result?.[0]
    let parsed: any
    try {
      parsed = typeof actual === 'string' ? JSON.parse(actual) : actual
      // Validate against the format schema
      parsed = format.parse(parsed)
    } catch (parseError) {
      throw new Error(`Failed to parse or validate response: ${parseError.message}`)
    }

    if (isEmptyResponse(result)) {
      throw new Error('Model returned empty response')
    }

    expect(parsed).toEqual(expected)
    // Only reached when the assertion above did not throw.
    passed = true

    return {
      test: testName,
      prompt,
      result: result || [],
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed,
      duration: Date.now() - startTime,
      reason: undefined,
      config: {
        router: getRouterForModel(modelName)
      }
    }
  } catch (e) {
    error = formatError(e)
    throw e
  } finally {
    // Stop the timeout timer so it does not outlive the test.
    if (timer !== undefined) {
      clearTimeout(timer)
    }
    // Record the real outcome; the API key is deliberately NOT written to the log file.
    const testResult: TestResult = {
      test: testName,
      prompt,
      result: result || [],
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed,
      duration: Date.now() - startTime,
      error,
      reason: passed ? undefined : (error?.message || 'Unknown error occurred'),
      config: {
        router: getRouterForModel(modelName)
      }
    }
    testResults.push(testResult)
    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
  }
}
// Table of structured-output scenarios; each one is registered once per model.
const structuredCases: Array<{ title: string, name: string, prompt: string, format: z.ZodType<any>, expected: any }> = [
  {
    title: 'should return basic structured output with model %s',
    name: 'basic_structure',
    prompt: 'return a greeting "hello" with count 42',
    format: z.object({
      greeting: z.string(),
      count: z.number()
    }),
    expected: { greeting: 'hello', count: 42 }
  },
  {
    title: 'should handle nested structures with model %s',
    name: 'nested_structure',
    prompt: 'return user John age 30 with dark theme and notifications enabled',
    format: z.object({
      user: z.object({ name: z.string(), age: z.number() }),
      settings: z.object({ theme: z.string(), notifications: z.boolean() })
    }),
    expected: {
      user: { name: 'John', age: 30 },
      settings: { theme: 'dark', notifications: true }
    }
  },
  {
    title: 'should handle arrays with model %s',
    name: 'array_structure',
    prompt: 'return a list of 2 items with ids 1 and 2, names "first" and "second"',
    format: z.object({
      items: z.array(z.object({ id: z.number(), name: z.string() }))
    }),
    expected: {
      items: [
        { id: 1, name: 'first' },
        { id: 2, name: 'second' }
      ]
    }
  },
  {
    title: 'should handle enums with model %s',
    name: 'enum_structure',
    prompt: 'return status success with message "Operation completed"',
    format: z.object({
      status: z.enum(['success', 'error', 'pending']),
      message: z.string()
    }),
    expected: { status: 'success', message: 'Operation completed' }
  },
  {
    title: 'should handle optional fields with model %s',
    name: 'optional_fields',
    prompt: 'return name "John" with age 30 and email "john@example.com"',
    format: z.object({
      name: z.string(),
      age: z.number().optional(),
      email: z.string().email().optional()
    }),
    expected: { name: 'John', age: 30, email: 'john@example.com' }
  }
]

for (const scenario of structuredCases) {
  it.each(models)(scenario.title, async (modelName) => {
    await runFormatTest(scenario.prompt, scenario.format, scenario.expected, scenario.name, modelName)
  })
}
it('should generate markdown report', () => {
  // Keep only the most recent record per test name and model.
  const latestResults = new Map<string, Map<string, TestResult>>()
  for (const entry of testResults) {
    let perModel = latestResults.get(entry.test)
    if (!perModel) {
      perModel = new Map()
      latestResults.set(entry.test, perModel)
    }
    const previous = perModel.get(entry.model)
    if (!previous || new Date(entry.timestamp) > new Date(previous.timestamp)) {
      perModel.set(entry.model, entry)
    }
  }

  // Render every record once, sorting it into the failed or passed bucket.
  const failedSections: string[] = []
  const passedSections: string[] = []
  for (const [testName, modelResults] of latestResults) {
    for (const [model, result] of modelResults) {
      const head = [
        `### ${testName} - ${model}\n`,
        `- Prompt: \`${result.prompt}\`\n`,
        `- Expected: \`${JSON.stringify(result.expected)}\`\n`,
        `- Actual: \`${JSON.stringify(result.result[0] || '')}\`\n`,
        `- Duration: ${result.duration}ms\n`
      ].join('')
      const stamp = `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`

      if (result.passed) {
        passedSections.push(head + stamp)
        continue
      }

      let details = ''
      if (result.error) {
        details += `- Error Type: ${result.error.type}\n`
        details += `- Error Code: ${result.error.code}\n`
        details += `- Error Message: ${result.error.message}\n`
      }
      details += `- Reason: ${result.reason}\n`
      failedSections.push(head + details + stamp)
    }
  }

  // Assemble the document: failures first, then passes.
  const report = [
    '# Format Test Results\n\n',
    '## Failed Tests\n\n',
    failedSections.length > 0 ? failedSections.join('') : '*No failed tests*\n\n',
    '## Passed Tests\n\n',
    passedSections.length > 0 ? passedSections.join('') : '*No passed tests*\n\n'
  ].join('')

  // Persist the report and confirm it exists on disk.
  const reportPath = path.resolve(__dirname, './format-report.md')
  write(reportPath, report)
  expect(exists(reportPath) === 'file').toBe(true)
})
import { describe, it, expect } from 'vitest'
import { run } from '../../src/index'
import * as path from 'node:path'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { z } from 'zod'
import {
models,
TEST_BASE_PATH,
TEST_LOGS_PATH,
TEST_PREFERENCES_PATH,
TEST_TIMEOUT,
TestResult,
formatError,
isEmptyResponse
} from './commons'
const TEST_LOG_PATH = path.resolve(__dirname, './format.json')
const TEST_SCHEMA_PATH = path.resolve(__dirname, './test-schema.json')
// Sample JSON Schema (draft-07) describing a user profile, used as the `format` option in the tests below.
// It is used both directly as an object and, after being written to TEST_SCHEMA_PATH, as a file path.
// NOTE(review): the schema requires "name", "age" and "email" and restricts "tags" to a fixed enum that
// does not include "javascript", while the file-based test asks for tags ["developer", "javascript"] and
// expects no email. Confirm whether the schema or that test reflects the intended contract.
const testJsonSchema = {
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://example.com/user-profile.schema.json",
"title": "User Profile",
"description": "A user profile containing name, age, and tags",
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "User's full name",
"minLength": 1,
"pattern": "^[A-Za-z\\s]+$"
},
"age": {
"type": "number",
"description": "User's age in years",
"minimum": 0,
"maximum": 150
},
"email": {
"type": "string",
"description": "User's email address",
"format": "email"
},
"tags": {
"type": "array",
"description": "List of user's tags",
"items": {
"type": "string",
"enum": ["developer", "designer", "manager", "admin", "user"]
},
"minItems": 1,
"maxItems": 5,
"uniqueItems": true
},
"address": {
"type": "object",
"description": "User's address",
"properties": {
"street": {
"type": "string",
"description": "Street address"
},
"city": {
"type": "string",
"description": "City name"
},
"country": {
"type": "string",
"description": "Country name",
"enum": ["US", "UK", "CA", "AU"]
},
"zipCode": {
"type": "string",
"description": "ZIP/Postal code",
"pattern": "^[0-9]{5}(-[0-9]{4})?$"
}
},
"required": ["street", "city", "country"]
},
"preferences": {
"type": "object",
"description": "User preferences",
"properties": {
"theme": {
"type": "string",
"enum": ["light", "dark", "system"],
"default": "system"
},
"notifications": {
"type": "boolean",
"default": true
},
"language": {
"type": "string",
"enum": ["en", "es", "fr", "de", "ja"],
"default": "en"
}
}
}
},
"required": ["name", "age", "email"],
"additionalProperties": false
}
// Write the schema to TEST_SCHEMA_PATH at module load so the file-path test can reference it.
write(TEST_SCHEMA_PATH, JSON.stringify(testJsonSchema, null, 2))
/**
 * Normalizes a JSON string so two payloads can be compared textually.
 *
 * A surrounding markdown code fence is removed first. The fence may carry any
 * language tag (or none) and may use LF or CRLF line endings. The remaining
 * text is parsed and re-serialized without insignificant whitespace.
 *
 * @param json Raw text, typically a model response.
 * @returns The compact JSON serialization, or `json` unchanged when the text
 *          cannot be parsed as JSON.
 */
const normalizeJson = (json: string) => {
  try {
    const cleanJson = json
      .trim()
      // Opening fence: ``` plus optional language tag and line break.
      .replace(/^```[\w-]*[ \t]*\r?\n?/, '')
      // Closing fence, including the line break that precedes it.
      .replace(/\r?\n?[ \t]*```$/, '')
      .trim()
    return JSON.stringify(JSON.parse(cleanJson))
  } catch {
    // Deliberate fallback: non-JSON text is compared as-is by callers.
    return json
  }
}
/**
 * Checks whether `email` has the shape `local@domain.tld`:
 * no whitespace, exactly one "@", and a dot inside the domain part.
 */
const isValidEmail = (email: string) => {
  const emailShape = /^[^\s@]+@[^\s@]+\.[^\s@]+$/
  return emailShape.test(email)
}
/**
 * Reports whether `num` lies within the closed interval [min, max].
 * Any NaN operand yields false.
 */
const isNumberInRange = (num: number, min: number, max: number) => {
  const atLeastMin = num >= min
  if (!atLeastMin) {
    return false
  }
  return num <= max
}
/**
 * True when `arr` is an array of exactly `length` entries, all of them strings.
 */
const hasValidArrayLength = (arr: any[], length: number) => {
  if (!Array.isArray(arr)) {
    return false
  }
  if (arr.length !== length) {
    return false
  }
  const hasNonString = arr.some(item => typeof item !== 'string')
  return !hasNonString
}
describe('Format Options', () => {
let testResults: TestResult[] = []
// Load existing results if any
if (exists(TEST_LOG_PATH)) {
const data = read(TEST_LOG_PATH, 'json')
testResults = Array.isArray(data) ? data : []
}
/**
 * Runs one format test through `run` and appends the outcome to the JSON log.
 *
 * The first response element is trimmed, normalized with `normalizeJson`, and
 * compared with the normalized `expected` string.
 *
 * @param prompt    Prompt sent to the model.
 * @param expected  Expected response text (JSON or plain).
 * @param testName  Identifier stored in the log record.
 * @param modelName Model passed to `run`.
 * @param options   Extra `run` options (e.g. `format`); may override the defaults above them.
 * @throws Re-throws any timeout, empty-response or assertion error after logging it.
 */
const runFormatTest = async (prompt: string, expected: string, testName: string, modelName: string, options: any = {}) => {
  let model = modelName
  let router = 'openai'
  const startTime = Date.now()
  let error: TestResult['error'] | undefined
  let result: string[] = []
  let passed = false
  let timer: ReturnType<typeof setTimeout> | undefined

  try {
    result = await Promise.race([
      run({
        prompt,
        mode: 'completion',
        // Honor the requested model instead of a hard-coded one.
        model: modelName,
        router: 'openai',
        path: TEST_BASE_PATH,
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        ...options,
        onRun: async (options) => {
          model = options.model || 'unknown'
          router = options.router || 'unknown'
          return options
        }
      }),
      new Promise((_, reject) => {
        timer = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
      })
    ]) as string[]

    if (isEmptyResponse(result)) {
      throw new Error('Model returned empty response')
    }

    const actual = result?.[0]?.trim() || ''
    const normalizedActual = normalizeJson(actual)
    const normalizedExpected = normalizeJson(expected)

    expect(normalizedActual).toEqual(normalizedExpected)
    // Only reached when the assertion above did not throw.
    passed = true
  } catch (e) {
    error = formatError(e)
    throw e
  } finally {
    // Stop the timeout timer so it does not outlive the test.
    if (timer !== undefined) {
      clearTimeout(timer)
    }
    // Log the real response even on failure so reports can show what the model returned.
    const testResult: TestResult = {
      test: testName,
      prompt,
      result: result || [],
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed,
      duration: Date.now() - startTime,
      error,
      reason: passed ? undefined : (error?.message || 'Unknown error occurred')
    }
    testResults.push(testResult)
    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
  }
}
// JSON Schema supplied to `run` as a path to the schema file on disk
it('should format response according to JSON Schema file', async () => {
  const expectedProfile = {
    name: "John Doe",
    age: 30,
    tags: ["developer", "javascript"]
  }
  const schemaOptions = { format: TEST_SCHEMA_PATH }

  await runFormatTest(
    'Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.',
    JSON.stringify(expectedProfile),
    'json-schema-file-format',
    'gpt-4o',
    schemaOptions
  )
}, { timeout: 10000 })
// JSON Schema supplied to `run` directly as an in-memory object
it('should format response according to JSON Schema object', async () => {
  const prompt = `Create a user profile with the following details:
- Name: Jane Smith
- Age: 25
- Email: jane.smith@company.com
- Tags: ["developer", "designer"]
- Address: 123 Main St, New York, US, 10001
- Preferences: light theme, notifications enabled, English language
Return only the JSON object, no explanation.`

  const result = await run({
    prompt,
    mode: 'completion',
    model: 'gpt-4o',
    router: 'openai',
    path: TEST_BASE_PATH,
    logs: TEST_LOGS_PATH,
    preferences: TEST_PREFERENCES_PATH,
    format: testJsonSchema
  }) as string[]

  const rawResponse = result?.[0] || '{}'
  const response = JSON.parse(normalizeJson(rawResponse))

  // Required top-level fields
  expect(response.name).toBe('Jane Smith')
  expect(response.age).toBe(25)
  expect(response.email).toBe('jane.smith@company.com')

  // Tags
  expect(Array.isArray(response.tags)).toBe(true)
  expect(response.tags).toContain('developer')
  expect(response.tags).toContain('designer')

  // Address; the postal code may arrive under either key
  expect(response.address.street).toBe('123 Main St')
  expect(response.address.city).toBe('New York')
  expect(response.address.country).toBe('US')
  const postalCode = response.address.zipCode || response.address.postal_code
  expect(postalCode).toMatch(/^[0-9]{5}$/)

  // Preferences; several spellings are tolerated for the loosely phrased fields
  expect(response.preferences.theme).toBe('light')
  const acceptedNotifications = ['true', true, 'enabled']
  expect(acceptedNotifications.includes(response.preferences.notifications)).toBe(true)
  const acceptedLanguages = ['en', 'English']
  expect(acceptedLanguages.includes(response.preferences.language)).toBe(true)
}, { timeout: 10000 })
// String schema: the format asks for an object with an `email` property
it('should format response according to Zod string schema', async () => {
  const prompt = 'Generate a valid email address for a business domain. Return only the email, no explanation.'

  const result = await run({
    prompt,
    mode: 'completion',
    model: 'gpt-4o',
    router: 'openai',
    path: TEST_BASE_PATH,
    logs: TEST_LOGS_PATH,
    preferences: TEST_PREFERENCES_PATH,
    format: {
      type: "object",
      properties: {
        email: {
          type: "string",
          format: "email"
        }
      },
      required: ["email"]
    }
  }) as string[]

  const raw = result?.[0]?.trim() || ''
  // Validate the `email` field when the response follows the requested object
  // shape (optionally fenced); otherwise treat the response as a bare address.
  let email = raw
  try {
    const parsed = JSON.parse(normalizeJson(raw))
    if (parsed !== null && typeof parsed === 'object' && typeof parsed.email === 'string') {
      email = parsed.email.trim()
    } else if (typeof parsed === 'string') {
      email = parsed.trim()
    }
  } catch {
    // Not JSON: keep the raw text and validate it directly.
  }
  expect(isValidEmail(email)).toBe(true)
}, { timeout: 10000 })
// Number schema: the format asks for an object with an `age` property in [18, 65]
it('should format response according to Zod number schema', async () => {
  const prompt = 'Generate a random age between 18 and 65. Return only the number, no explanation.'

  const result = await run({
    prompt,
    mode: 'completion',
    model: 'gpt-4o',
    router: 'openai',
    path: TEST_BASE_PATH,
    logs: TEST_LOGS_PATH,
    preferences: TEST_PREFERENCES_PATH,
    format: {
      type: "object",
      properties: {
        age: {
          type: "number",
          minimum: 18,
          maximum: 65
        }
      },
      required: ["age"]
    }
  }) as string[]

  const raw = result?.[0]?.trim() || '0'
  // Read `age` from an object-shaped (optionally fenced) response; fall back to
  // parsing the raw text as a bare integer, as `parseInt` alone yields NaN on JSON objects.
  let age: number
  try {
    const parsed = JSON.parse(normalizeJson(raw))
    age = parsed !== null && typeof parsed === 'object'
      ? Number(parsed.age)
      : parseInt(String(parsed), 10)
  } catch {
    age = parseInt(raw, 10)
  }
  expect(isNumberInRange(age, 18, 65)).toBe(true)
}, { timeout: 10000 })
// Array schema: the format asks for an object with a 3-item `languages` string array
it('should format response according to Zod array schema', async () => {
  const prompt = 'Generate a list of 3 programming languages. Return only the array, no explanation.'

  const result = await run({
    prompt,
    mode: 'completion',
    model: 'gpt-4o',
    router: 'openai',
    path: TEST_BASE_PATH,
    logs: TEST_LOGS_PATH,
    preferences: TEST_PREFERENCES_PATH,
    format: {
      type: "object",
      properties: {
        languages: {
          type: "array",
          items: {
            type: "string"
          },
          minItems: 3,
          maxItems: 3
        }
      },
      required: ["languages"]
    }
  }) as string[]

  const raw = result?.[0]?.trim() || '[]'
  // Strip any markdown fence before parsing, then accept either a bare array
  // or the object shape requested by the format.
  const parsed = JSON.parse(normalizeJson(raw))
  const languages = Array.isArray(parsed) ? parsed : parsed?.languages
  expect(hasValidArrayLength(languages, 3)).toBe(true)
}, { timeout: 10000 })
// Invalid format option: `run` must reject with an error that mentions the invalid format/schema
it('should handle invalid format option', async () => {
  const prompt = 'Generate a random number.'

  // Asserting on the rejection directly gives an accurate failure message when
  // `run` unexpectedly resolves, instead of funnelling a sentinel error through a catch block.
  await expect(
    run({
      prompt,
      mode: 'completion',
      model: 'gpt-4o',
      router: 'openai',
      path: TEST_BASE_PATH,
      logs: TEST_LOGS_PATH,
      preferences: TEST_PREFERENCES_PATH,
      format: {
        type: "invalid",
        properties: {}
      }
    })
  ).rejects.toThrow(/invalid|Invalid|schema|Schema/)
}, { timeout: 10000 })
// Builds format-report.md from the accumulated results: latest failed and passed
// records per test and model, followed by a summary over ALL logged records.
it('should generate markdown report', () => {
  // Get only the latest result for each test+model combination
  const latestResults = new Map<string, Map<string, TestResult>>()
  for (const result of testResults) {
    if (!latestResults.has(result.test)) {
      latestResults.set(result.test, new Map())
    }
    const testMap = latestResults.get(result.test)!
    const existingResult = testMap.get(result.model)
    if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
      testMap.set(result.model, result)
    }
  }

  // Generate markdown report
  let report = '# Format Test Results\n\n'

  // First list failed tests
  report += '## Failed Tests\n\n'
  let hasFailures = false
  for (const [testName, modelResults] of latestResults) {
    for (const [model, result] of modelResults) {
      if (!result.passed) {
        hasFailures = true
        report += `### ${testName} - ${model}\n`
        report += `- Prompt: \`${result.prompt}\`\n`
        report += `- Expected: \`${result.expected}\`\n`
        report += `- Actual: \`${result.result[0] || ''}\`\n`
        report += `- Duration: ${result.duration}ms\n`
        if (result.error) {
          report += `- Error Type: ${result.error.type}\n`
          report += `- Error Code: ${result.error.code}\n`
          report += `- Error Message: ${result.error.message}\n`
          if (result.error.details?.message) {
            report += `- Error Details: ${result.error.details.message}\n`
          }
        }
        report += `- Reason: ${result.reason}\n`
        report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
      }
    }
  }
  if (!hasFailures) {
    report += '*No failed tests*\n\n'
  }

  // Then list passed tests
  report += '## Passed Tests\n\n'
  let hasPassed = false
  for (const [testName, modelResults] of latestResults) {
    for (const [model, result] of modelResults) {
      if (result.passed) {
        hasPassed = true
        report += `### ${testName} - ${model}\n`
        report += `- Prompt: \`${result.prompt}\`\n`
        report += `- Expected: \`${result.expected}\`\n`
        report += `- Actual: \`${result.result[0] || ''}\`\n`
        report += `- Duration: ${result.duration}ms\n`
        report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
      }
    }
  }
  if (!hasPassed) {
    report += '*No passed tests*\n\n'
  }

  // Add summary section (counts every logged record, not just the latest per test+model)
  report += '## Summary\n\n'
  const totalTests = testResults.length
  const passedTests = testResults.filter(r => r.passed).length
  const failedTests = totalTests - passedTests
  // Guard against 0/0, which would otherwise render as "NaN%".
  const successRate = totalTests === 0 ? 0 : (passedTests / totalTests) * 100
  report += `- Total Tests: ${totalTests}\n`
  report += `- Passed: ${passedTests}\n`
  report += `- Failed: ${failedTests}\n`
  report += `- Success Rate: ${successRate.toFixed(2)}%\n\n`

  // Write report to file
  const reportPath = path.resolve(__dirname, './format-report.md')
  write(reportPath, report)

  // Verify report was written
  expect(exists(reportPath) === 'file').toBe(true)
})
})

View File

@ -36,9 +36,9 @@
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
- Expected: `hallo`
- Actual: ``
- Duration: 1253ms
- Duration: 1192ms
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:47:26 PM
- Timestamp: 4/1/2025, 11:53:55 PM
### spanish - deepseek/deepseek-chat:free
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
@ -74,9 +74,9 @@
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
- Expected: `sí`
- Actual: ``
- Duration: 932ms
- Duration: 1193ms
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:47:27 PM
- Timestamp: 4/1/2025, 11:53:56 PM
### french - deepseek/deepseek-chat:free
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
@ -112,9 +112,9 @@
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
- Expected: `non`
- Actual: ``
- Duration: 864ms
- Duration: 968ms
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:47:28 PM
- Timestamp: 4/1/2025, 11:53:57 PM
## Passed Tests

View File

@ -807,5 +807,41 @@
"passed": false,
"duration": 864,
"reason": "Unknown error occurred"
},
{
"test": "german",
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
"result": [],
"expected": "hallo",
"model": "anthropic/claude-2.0",
"router": "openrouter",
"timestamp": "2025-04-01T21:53:55.163Z",
"passed": false,
"duration": 1192,
"reason": "Unknown error occurred"
},
{
"test": "spanish",
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
"result": [],
"expected": "sí",
"model": "anthropic/claude-2.0",
"router": "openrouter",
"timestamp": "2025-04-01T21:53:56.357Z",
"passed": false,
"duration": 1193,
"reason": "Unknown error occurred"
},
{
"test": "french",
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
"result": [],
"expected": "non",
"model": "anthropic/claude-2.0",
"router": "openrouter",
"timestamp": "2025-04-01T21:53:57.326Z",
"passed": false,
"duration": 968,
"reason": "Unknown error occurred"
}
]

View File

@ -36,9 +36,9 @@
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: ``
- Duration: 1218ms
- Duration: 1992ms
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:46:27 PM
- Timestamp: 4/1/2025, 11:53:40 PM
### multiplication - deepseek/deepseek-chat:free
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
@ -74,9 +74,9 @@
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: ``
- Duration: 911ms
- Duration: 1078ms
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:46:27 PM
- Timestamp: 4/1/2025, 11:53:41 PM
### division - deepseek/deepseek-chat:free
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
@ -112,9 +112,9 @@
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: ``
- Duration: 1485ms
- Duration: 940ms
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:46:29 PM
- Timestamp: 4/1/2025, 11:53:42 PM
## Passed Tests

View File

@ -2213,5 +2213,41 @@
"passed": false,
"duration": 1485,
"reason": "Unknown error occurred"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [],
"expected": "8",
"model": "anthropic/claude-2.0",
"router": "openrouter",
"timestamp": "2025-04-01T21:53:40.351Z",
"passed": false,
"duration": 1992,
"reason": "Unknown error occurred"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [],
"expected": "24",
"model": "anthropic/claude-2.0",
"router": "openrouter",
"timestamp": "2025-04-01T21:53:41.431Z",
"passed": false,
"duration": 1078,
"reason": "Unknown error occurred"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [],
"expected": "5",
"model": "anthropic/claude-2.0",
"router": "openrouter",
"timestamp": "2025-04-01T21:53:42.372Z",
"passed": false,
"duration": 940,
"reason": "Unknown error occurred"
}
]

View File

@ -0,0 +1,113 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://example.com/user-profile.schema.json",
"title": "User Profile",
"description": "A user profile containing name, age, and tags",
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "User's full name",
"minLength": 1,
"pattern": "^[A-Za-z\\s]+$"
},
"age": {
"type": "number",
"description": "User's age in years",
"minimum": 0,
"maximum": 150
},
"email": {
"type": "string",
"description": "User's email address",
"format": "email"
},
"tags": {
"type": "array",
"description": "List of user's tags",
"items": {
"type": "string",
"enum": [
"developer",
"designer",
"manager",
"admin",
"user"
]
},
"minItems": 1,
"maxItems": 5,
"uniqueItems": true
},
"address": {
"type": "object",
"description": "User's address",
"properties": {
"street": {
"type": "string",
"description": "Street address"
},
"city": {
"type": "string",
"description": "City name"
},
"country": {
"type": "string",
"description": "Country name",
"enum": [
"US",
"UK",
"CA",
"AU"
]
},
"zipCode": {
"type": "string",
"description": "ZIP/Postal code",
"pattern": "^[0-9]{5}(-[0-9]{4})?$"
}
},
"required": [
"street",
"city",
"country"
]
},
"preferences": {
"type": "object",
"description": "User preferences",
"properties": {
"theme": {
"type": "string",
"enum": [
"light",
"dark",
"system"
],
"default": "system"
},
"notifications": {
"type": "boolean",
"default": true
},
"language": {
"type": "string",
"enum": [
"en",
"es",
"fr",
"de",
"ja"
],
"default": "en"
}
}
}
},
"required": [
"name",
"age",
"email"
],
"additionalProperties": false
}