tests:response_format
This commit is contained in:
parent
2eac7ecda1
commit
20cb6861af
@ -35,6 +35,7 @@ export const runCompletion = async (client, params, options) => {
|
||||
logger.info('Dry run - skipping API call');
|
||||
return false;
|
||||
}
|
||||
// await client.beta.chat.completions.parse
|
||||
const completion = await client.chat.completions.create({
|
||||
model: options.model,
|
||||
messages: params.messages,
|
||||
@ -44,4 +45,4 @@ export const runCompletion = async (client, params, options) => {
|
||||
result = await onCompletion(result, options);
|
||||
return result;
|
||||
};
|
||||
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoicnVuLWNvbXBsZXRpb24uanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi9zcmMvY29tbWFuZHMvcnVuLWNvbXBsZXRpb24udHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQ0EsT0FBTyxFQUFFLE1BQU0sRUFBRSxNQUFNLFFBQVEsQ0FBQTtBQUMvQixPQUFPLEVBQUUsY0FBYyxFQUFFLE1BQU0saUJBQWlCLENBQUE7QUFDaEQsT0FBTyxLQUFLLElBQUksTUFBTSxXQUFXLENBQUE7QUFDakMsT0FBTyxFQUFFLElBQUksSUFBSSxLQUFLLEVBQUUsTUFBTSxvQkFBb0IsQ0FBQTtBQUNsRCxPQUFPLEVBQUUsT0FBTyxFQUFFLE1BQU0sbUJBQW1CLENBQUE7QUFJM0MsT0FBTyxFQUFFLE1BQU0sRUFBRSxNQUFNLGFBQWEsQ0FBQTtBQUNwQyxPQUFPLEVBQUUsWUFBWSxFQUFFLE1BQU0sb0JBQW9CLENBQUE7QUFDakQsT0FBTyxFQUFFLFlBQVksRUFBVSxNQUFNLGVBQWUsQ0FBQTtBQUNwRCxPQUFPLEVBQUUsU0FBUyxFQUFFLE1BQU0saUJBQWlCLENBQUE7QUFHM0MsTUFBTSxDQUFDLE1BQU0sWUFBWSxHQUFHLEtBQUssRUFBRSxTQUFjLEVBQUUsRUFBRSxPQUFrQixFQUFFLEVBQUU7SUFDekUsTUFBTSxHQUFHLFlBQVksQ0FBQyxNQUFNLEVBQUUsT0FBTyxDQUFDLE9BQW1CLElBQUksRUFBRSxDQUFDLENBQUE7SUFDaEUsTUFBTSxJQUFJLEdBQUcsU0FBUyxDQUFDLE9BQU8sQ0FBQyxDQUFBO0lBQy9CLElBQUksT0FBTyxDQUFDLEdBQUcsRUFBRSxDQUFDO1FBQ2hCLE1BQU0sT0FBTyxHQUFHLElBQUksQ0FBQyxPQUFPLENBQUMsT0FBTyxDQUFDLE9BQU8sQ0FBQyxHQUFHLEVBQUUsS0FBSyxFQUFFO1lBQ3ZELEdBQUcsSUFBSTtZQUNQLEtBQUssRUFBRSxJQUFJLENBQUMsS0FBSyxDQUFDLE9BQU8sQ0FBQyxLQUFLLENBQUMsQ0FBQyxJQUFJO1lBQ3JDLE1BQU0sRUFBRSxPQUFPLENBQUMsTUFBTTtTQUN2QixDQUFDLENBQUMsQ0FBQTtRQUNILEtBQUssQ0FBQyxPQUFPLEVBQUUsTUFBTSxDQUFDLENBQUE7UUFDdEIsTUFBTSxDQUFDLEtBQUssQ0FBQyw4QkFBOEIsT0FBTyxNQUFNLE9BQU8sQ0FBQyxHQUFHLEVBQUUsQ0FBQyxDQUFBO0lBQ3hFLENBQUM7U0FBTSxDQUFDO1FBQ04sTUFBTSxDQUFDLEdBQUcsQ0FBQyxjQUFjLENBQUM7WUFDeEIsS0FBSyxFQUFFLEtBQUs7U0FDYixDQUFDLENBQUMsQ0FBQTtRQUNILE1BQU0sT0FBTyxHQUFXLE1BQU0sQ0FBQyxNQUFNLENBQVcsQ0FBQztRQUNqRCxPQUFPLENBQUMsTUFBTSxDQUFDLEtBQUssQ0FBQyxPQUFPLENBQUMsQ0FBQTtJQUMvQixDQUFDO0lBQ0QsWUFBWSxDQUFDLE9BQU8sQ0FBQyxDQUFBO0lBQ3JCLGtCQUFrQjtJQUNsQixPQUFPLE1BQU0sQ0FBQTtBQUNmLENBQUMsQ0FBQTtBQUVELE1BQU0sQ0FBQyxNQUFNLGFBQWEsR0FBRyxLQUFLLEVBQUUsTUFBYyxFQUFFLE1BQVcsRUFBRSxPQUFrQixFQUFFLEVBQUU7SUFDckYsSUFBSSxPQUFPLENBQUMsR0FBRyxFQUFFLENBQUM7UU
FDaEIsTUFBTSxDQUFDLElBQUksQ0FBQyw2QkFBNkIsQ0FBQyxDQUFBO1FBQzFDLE9BQU8sS0FBSyxDQUFBO0lBQ2QsQ0FBQztJQUNELE1BQU0sVUFBVSxHQUFHLE1BQU0sTUFBTSxDQUFDLElBQUksQ0FBQyxXQUFXLENBQUMsTUFBTSxDQUFDO1FBQ3RELEtBQUssRUFBRSxPQUFPLENBQUMsS0FBSztRQUNwQixRQUFRLEVBQUUsTUFBTSxDQUFDLFFBQVE7UUFDekIsZUFBZSxFQUFFLE9BQU8sQ0FBQyxNQUFhO0tBQ3ZDLENBQUMsQ0FBQTtJQUNGLElBQUksTUFBTSxHQUFHLFVBQVUsQ0FBQyxPQUFPLENBQUMsQ0FBQyxDQUFDLENBQUMsT0FBTyxDQUFDLE9BQU8sQ0FBQTtJQUNsRCxNQUFNLEdBQUcsTUFBTSxZQUFZLENBQUMsTUFBTSxFQUFFLE9BQU8sQ0FBQyxDQUFBO0lBQzVDLE9BQU8sTUFBTSxDQUFBO0FBQ2YsQ0FBQyxDQUFBIn0=
|
||||
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoicnVuLWNvbXBsZXRpb24uanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi9zcmMvY29tbWFuZHMvcnVuLWNvbXBsZXRpb24udHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQ0EsT0FBTyxFQUFFLE1BQU0sRUFBRSxNQUFNLFFBQVEsQ0FBQTtBQUMvQixPQUFPLEVBQUUsY0FBYyxFQUFFLE1BQU0saUJBQWlCLENBQUE7QUFDaEQsT0FBTyxLQUFLLElBQUksTUFBTSxXQUFXLENBQUE7QUFDakMsT0FBTyxFQUFFLElBQUksSUFBSSxLQUFLLEVBQUUsTUFBTSxvQkFBb0IsQ0FBQTtBQUNsRCxPQUFPLEVBQUUsT0FBTyxFQUFFLE1BQU0sbUJBQW1CLENBQUE7QUFJM0MsT0FBTyxFQUFFLE1BQU0sRUFBRSxNQUFNLGFBQWEsQ0FBQTtBQUNwQyxPQUFPLEVBQUUsWUFBWSxFQUFFLE1BQU0sb0JBQW9CLENBQUE7QUFDakQsT0FBTyxFQUFFLFlBQVksRUFBVSxNQUFNLGVBQWUsQ0FBQTtBQUNwRCxPQUFPLEVBQUUsU0FBUyxFQUFFLE1BQU0saUJBQWlCLENBQUE7QUFHM0MsTUFBTSxDQUFDLE1BQU0sWUFBWSxHQUFHLEtBQUssRUFBRSxTQUFjLEVBQUUsRUFBRSxPQUFrQixFQUFFLEVBQUU7SUFDekUsTUFBTSxHQUFHLFlBQVksQ0FBQyxNQUFNLEVBQUUsT0FBTyxDQUFDLE9BQW1CLElBQUksRUFBRSxDQUFDLENBQUE7SUFDaEUsTUFBTSxJQUFJLEdBQUcsU0FBUyxDQUFDLE9BQU8sQ0FBQyxDQUFBO0lBQy9CLElBQUksT0FBTyxDQUFDLEdBQUcsRUFBRSxDQUFDO1FBQ2hCLE1BQU0sT0FBTyxHQUFHLElBQUksQ0FBQyxPQUFPLENBQUMsT0FBTyxDQUFDLE9BQU8sQ0FBQyxHQUFHLEVBQUUsS0FBSyxFQUFFO1lBQ3ZELEdBQUcsSUFBSTtZQUNQLEtBQUssRUFBRSxJQUFJLENBQUMsS0FBSyxDQUFDLE9BQU8sQ0FBQyxLQUFLLENBQUMsQ0FBQyxJQUFJO1lBQ3JDLE1BQU0sRUFBRSxPQUFPLENBQUMsTUFBTTtTQUN2QixDQUFDLENBQUMsQ0FBQTtRQUNILEtBQUssQ0FBQyxPQUFPLEVBQUUsTUFBTSxDQUFDLENBQUE7UUFDdEIsTUFBTSxDQUFDLEtBQUssQ0FBQyw4QkFBOEIsT0FBTyxNQUFNLE9BQU8sQ0FBQyxHQUFHLEVBQUUsQ0FBQyxDQUFBO0lBQ3hFLENBQUM7U0FBTSxDQUFDO1FBQ04sTUFBTSxDQUFDLEdBQUcsQ0FBQyxjQUFjLENBQUM7WUFDeEIsS0FBSyxFQUFFLEtBQUs7U0FDYixDQUFDLENBQUMsQ0FBQTtRQUNILE1BQU0sT0FBTyxHQUFXLE1BQU0sQ0FBQyxNQUFNLENBQVcsQ0FBQztRQUNqRCxPQUFPLENBQUMsTUFBTSxDQUFDLEtBQUssQ0FBQyxPQUFPLENBQUMsQ0FBQTtJQUMvQixDQUFDO0lBQ0QsWUFBWSxDQUFDLE9BQU8sQ0FBQyxDQUFBO0lBQ3JCLGtCQUFrQjtJQUNsQixPQUFPLE1BQU0sQ0FBQTtBQUNmLENBQUMsQ0FBQTtBQUVELE1BQU0sQ0FBQyxNQUFNLGFBQWEsR0FBRyxLQUFLLEVBQUUsTUFBYyxFQUFFLE1BQVcsRUFBRSxPQUFrQixFQUFFLEVBQUU7SUFDckYsSUFBSSxPQUFPLENBQUMsR0FBRyxFQUFFLENBQUM7UU
FDaEIsTUFBTSxDQUFDLElBQUksQ0FBQyw2QkFBNkIsQ0FBQyxDQUFBO1FBQzFDLE9BQU8sS0FBSyxDQUFBO0lBQ2QsQ0FBQztJQUNELDJDQUEyQztJQUMzQyxNQUFNLFVBQVUsR0FBRyxNQUFNLE1BQU0sQ0FBQyxJQUFJLENBQUMsV0FBVyxDQUFDLE1BQU0sQ0FBQztRQUN0RCxLQUFLLEVBQUUsT0FBTyxDQUFDLEtBQUs7UUFDcEIsUUFBUSxFQUFFLE1BQU0sQ0FBQyxRQUFRO1FBQ3pCLGVBQWUsRUFBRSxPQUFPLENBQUMsTUFBYTtLQUN2QyxDQUFDLENBQUE7SUFDRixJQUFJLE1BQU0sR0FBRyxVQUFVLENBQUMsT0FBTyxDQUFDLENBQUMsQ0FBQyxDQUFDLE9BQU8sQ0FBQyxPQUFPLENBQUE7SUFDbEQsTUFBTSxHQUFHLE1BQU0sWUFBWSxDQUFDLE1BQU0sRUFBRSxPQUFPLENBQUMsQ0FBQTtJQUM1QyxPQUFPLE1BQU0sQ0FBQTtBQUNmLENBQUMsQ0FBQSJ9
|
||||
File diff suppressed because one or more lines are too long
@ -1,9 +1,9 @@
|
||||
{
|
||||
"model": "anthropic/claude-2.0",
|
||||
"model": "gpt-4o",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "translate \"no\" to French. Return only the translated word, no explanation."
|
||||
"content": "Generate a random number."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
|
||||
@ -19,6 +19,7 @@
|
||||
"test": "vitest run",
|
||||
"test:basic": "vitest run tests/unit/basic.test.ts",
|
||||
"test:math": "vitest run tests/unit/math.test.ts",
|
||||
"test:format": "vitest run tests/unit/format.test.ts",
|
||||
"test:language": "vitest run tests/unit/language.test.ts",
|
||||
"test2:watch": "vitest",
|
||||
"test2:coverage": "vitest run --coverage",
|
||||
|
||||
@ -56,8 +56,7 @@ export const models = () => {
|
||||
const openRouterPath = path.resolve(OPENROUTER_CACHE_PATH)
|
||||
if (!exists(openRouterPath)) {
|
||||
fetchOpenRouterModels()
|
||||
}
|
||||
if (exists(openRouterPath)) {
|
||||
}else{
|
||||
const modelData: OpenRouterCachedModels = read(openRouterPath, 'json') as OpenRouterCachedModels
|
||||
models.push(chalk.magenta.bold('\n OpenRouter models:\n'))
|
||||
models.push(...listOpenRouterModelsAsStrings(modelData.models))
|
||||
@ -85,8 +84,7 @@ export const all = () => {
|
||||
const openRouterPath = path.resolve(OPENROUTER_CACHE_PATH)
|
||||
if (!exists(openRouterPath)) {
|
||||
fetchOpenRouterModels()
|
||||
}
|
||||
if (exists(openRouterPath)) {
|
||||
}else{
|
||||
const modelData: OpenRouterCachedModels = read(openRouterPath, 'json') as OpenRouterCachedModels
|
||||
models = models.concat(modelData.models)
|
||||
}
|
||||
|
||||
@ -2,171 +2,56 @@
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### basic_arithmetic - deepseek/deepseek-chat:free
|
||||
- Prompt: `return the result of 2+2, dont comment`
|
||||
- Expected: `undefined`
|
||||
- Actual: `4`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:30 PM
|
||||
|
||||
### basic_arithmetic - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `return the result of 2+2, dont comment`
|
||||
- Expected: `undefined`
|
||||
- Actual: `4
|
||||
`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:31 PM
|
||||
|
||||
### basic_arithmetic - gpt-4
|
||||
- Prompt: `return the result of 2+2, dont comment`
|
||||
- Expected: `undefined`
|
||||
- Actual: `4`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:32 PM
|
||||
|
||||
### json_structure - deepseek/deepseek-chat:free
|
||||
- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.`
|
||||
- Expected: `undefined`
|
||||
- Actual: `{"name":"test","value":42}`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:33 PM
|
||||
|
||||
### json_structure - gpt-4
|
||||
- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.`
|
||||
- Expected: `undefined`
|
||||
- Actual: `{"name": "test", "value": 42}`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:36 PM
|
||||
|
||||
### json_structure - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.`
|
||||
- Expected: `undefined`
|
||||
- Actual: `{
|
||||
"name": "test",
|
||||
"value": 42
|
||||
}`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:35 PM
|
||||
|
||||
### hello - deepseek/deepseek-chat:free
|
||||
- Prompt: `say "hello"`
|
||||
- Expected: `hello`
|
||||
- Actual: ``
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:36:37 PM
|
||||
|
||||
### hello - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `say "hello"`
|
||||
- Expected: `hello`
|
||||
- Actual: ``
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:36:37 PM
|
||||
|
||||
### hello - gpt-4
|
||||
- Prompt: `say "hello"`
|
||||
- Expected: `hello`
|
||||
- Actual: ``
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:36:42 PM
|
||||
|
||||
### goodbye - deepseek/deepseek-chat:free
|
||||
- Prompt: `say "goodbye"`
|
||||
- Expected: `goodbye`
|
||||
- Actual: ``
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:36:42 PM
|
||||
|
||||
### goodbye - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `say "goodbye"`
|
||||
- Expected: `goodbye`
|
||||
- Actual: ``
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:36:43 PM
|
||||
|
||||
### goodbye - gpt-4
|
||||
- Prompt: `say "goodbye"`
|
||||
- Expected: `goodbye`
|
||||
- Actual: ``
|
||||
- Reason: expected 'goodbye.' to deeply equal 'goodbye'
|
||||
- Timestamp: 4/1/2025, 1:36:44 PM
|
||||
|
||||
### yes - deepseek/deepseek-chat:free
|
||||
- Prompt: `say "yes"`
|
||||
- Expected: `yes`
|
||||
- Actual: ``
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:36:45 PM
|
||||
|
||||
### yes - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `say "yes"`
|
||||
- Expected: `yes`
|
||||
- Actual: ``
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:36:45 PM
|
||||
|
||||
### yes - gpt-4
|
||||
- Prompt: `say "yes"`
|
||||
- Expected: `yes`
|
||||
- Actual: ``
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:36:46 PM
|
||||
*No failed tests*
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### addition - deepseek/deepseek-chat:free
|
||||
### addition - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Timestamp: 4/1/2025, 12:59:06 PM
|
||||
- Duration: 1551ms
|
||||
- Timestamp: 4/2/2025, 12:17:39 AM
|
||||
|
||||
### addition - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:59:08 PM
|
||||
|
||||
### addition - gpt-4
|
||||
### addition - qwen/qwq-32b
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Timestamp: 4/1/2025, 1:39:04 PM
|
||||
- Duration: 3621ms
|
||||
- Timestamp: 4/2/2025, 12:17:42 AM
|
||||
|
||||
### multiplication - deepseek/deepseek-chat:free
|
||||
### multiplication - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Timestamp: 4/1/2025, 12:59:13 PM
|
||||
- Duration: 873ms
|
||||
- Timestamp: 4/2/2025, 12:17:43 AM
|
||||
|
||||
### multiplication - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:59:15 PM
|
||||
|
||||
### multiplication - gpt-4
|
||||
### multiplication - qwen/qwq-32b
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Timestamp: 4/1/2025, 1:39:06 PM
|
||||
- Duration: 3472ms
|
||||
- Timestamp: 4/2/2025, 12:17:47 AM
|
||||
|
||||
### division - deepseek/deepseek-chat:free
|
||||
### division - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Timestamp: 4/1/2025, 12:59:18 PM
|
||||
- Duration: 1183ms
|
||||
- Timestamp: 4/2/2025, 12:17:48 AM
|
||||
|
||||
### division - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:56:09 PM
|
||||
|
||||
### division - gpt-4
|
||||
### division - qwen/qwq-32b
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Timestamp: 4/1/2025, 1:39:08 PM
|
||||
- Duration: 4841ms
|
||||
- Timestamp: 4/2/2025, 12:17:53 AM
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 6
|
||||
- Passed: 6
|
||||
- Failed: 0
|
||||
- Success Rate: 100.00%
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -4,153 +4,121 @@ import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
|
||||
import {
|
||||
models,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
formatError,
|
||||
isEmptyResponse
|
||||
} from './commons'
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './basic.json')
|
||||
|
||||
describe('Basic Operations', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
// Load existing results if any
|
||||
if (exists(TEST_LOG_PATH)) {
|
||||
const data = read(TEST_LOG_PATH, 'json')
|
||||
testResults = Array.isArray(data) ? data : []
|
||||
|
||||
const runBasicTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
let startTime = Date.now()
|
||||
let error: TestResult['error'] | undefined
|
||||
let testResult: TestResult | undefined
|
||||
|
||||
try {
|
||||
const result = await Promise.race([
|
||||
run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}),
|
||||
new Promise((_, reject) =>
|
||||
setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
|
||||
)
|
||||
]) as string[]
|
||||
|
||||
if (isEmptyResponse(result)) {
|
||||
throw new Error('Model returned empty response')
|
||||
}
|
||||
|
||||
const actual = result?.[0]?.trim()?.toLowerCase() || ''
|
||||
const passed = actual === expected
|
||||
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
testResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
duration: Date.now() - startTime,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
|
||||
}
|
||||
} catch (e) {
|
||||
error = formatError(e)
|
||||
testResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed: false,
|
||||
duration: Date.now() - startTime,
|
||||
error,
|
||||
reason: error?.message || 'Unknown error occurred'
|
||||
}
|
||||
throw e
|
||||
} finally {
|
||||
if (testResult) {
|
||||
testResults.push(testResult)
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
it.each(models)('should add two numbers with model %s', async (modelName) => {
|
||||
const prompt = 'add 5 and 3. Return only the number, no explanation.'
|
||||
const expected = '8'
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
|
||||
const result = await run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim() || ''
|
||||
if (!actual) {
|
||||
console.log(`Skipping test for model ${modelName} - no result returned`)
|
||||
return
|
||||
}
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'addition',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
})
|
||||
await runBasicTest(
|
||||
'add 5 and 3. Return only the number, no explanation.',
|
||||
'8',
|
||||
'addition',
|
||||
modelName
|
||||
)
|
||||
}, { timeout: 10000 })
|
||||
|
||||
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
|
||||
const prompt = 'multiply 8 and 3. Return only the number, no explanation.'
|
||||
const expected = '24'
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
|
||||
const result = await run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim() || ''
|
||||
if (!actual) {
|
||||
console.log(`Skipping test for model ${modelName} - no result returned`)
|
||||
return
|
||||
}
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'multiplication',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
})
|
||||
await runBasicTest(
|
||||
'multiply 8 and 3. Return only the number, no explanation.',
|
||||
'24',
|
||||
'multiplication',
|
||||
modelName
|
||||
)
|
||||
}, { timeout: 10000 })
|
||||
|
||||
it.each(models)('should divide two numbers with model %s', async (modelName) => {
|
||||
const prompt = 'divide 15 by 3. Return only the number, no explanation.'
|
||||
const expected = '5'
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
|
||||
const result = await run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim() || ''
|
||||
if (!actual) {
|
||||
console.log(`Skipping test for model ${modelName} - no result returned`)
|
||||
return
|
||||
}
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'division',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
})
|
||||
await runBasicTest(
|
||||
'divide 15 by 3. Return only the number, no explanation.',
|
||||
'5',
|
||||
'division',
|
||||
modelName
|
||||
)
|
||||
}, { timeout: 10000 })
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
// Group results by test and model
|
||||
@ -173,32 +141,64 @@ describe('Basic Operations', () => {
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
let hasFailures = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
hasFailures = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
if (result.error) {
|
||||
report += `- Error Type: ${result.error.type}\n`
|
||||
report += `- Error Code: ${result.error.code}\n`
|
||||
report += `- Error Message: ${result.error.message}\n`
|
||||
if (result.error.details?.message) {
|
||||
report += `- Error Details: ${result.error.details.message}\n`
|
||||
}
|
||||
}
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFailures) {
|
||||
report += '*No failed tests*\n\n'
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
let hasPassed = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
hasPassed = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasPassed) {
|
||||
report += '*No passed tests*\n\n'
|
||||
}
|
||||
|
||||
// Add summary section
|
||||
report += '## Summary\n\n'
|
||||
const totalTests = testResults.length
|
||||
const passedTests = testResults.filter(r => r.passed).length
|
||||
const failedTests = totalTests - passedTests
|
||||
report += `- Total Tests: ${totalTests}\n`
|
||||
report += `- Passed: ${passedTests}\n`
|
||||
report += `- Failed: ${failedTests}\n`
|
||||
report += `- Success Rate: ${((passedTests / totalTests) * 100).toFixed(2)}%\n\n`
|
||||
|
||||
// Write report to file
|
||||
const reportPath = path.resolve(__dirname, './basic-report.md')
|
||||
|
||||
@ -3,7 +3,8 @@ import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL, E_OPENROUTER_MODEL } from '../
|
||||
|
||||
export const models = [
|
||||
//E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
|
||||
E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_2_0
|
||||
E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
|
||||
E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B
|
||||
]
|
||||
|
||||
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
|
||||
|
||||
@ -2,546 +2,224 @@
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### basic_structure - deepseek/deepseek-chat:free
|
||||
- Prompt: `return a greeting "hello" with count 42`
|
||||
- Expected: `{"greeting":"hello","count":42}`
|
||||
- Actual: `""`
|
||||
- Duration: 885ms
|
||||
- Error Type: Error
|
||||
### json-schema-file-format - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.`
|
||||
- Expected: `{"name":"John Doe","age":30,"tags":["developer","javascript"]}`
|
||||
- Actual: ``
|
||||
- Duration: 1690ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:36 PM
|
||||
- Error Message: expected '{\n "name": "John Doe",\n "age": 30…' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
|
||||
- Error Details: expected '{\n "name": "John Doe",\n "age": 30…' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
|
||||
- Reason: expected '{\n "name": "John Doe",\n "age": 30…' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
|
||||
- Timestamp: 4/2/2025, 12:23:43 AM
|
||||
|
||||
### basic_structure - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `return a greeting "hello" with count 42`
|
||||
- Expected: `{"greeting":"hello","count":42}`
|
||||
- Actual: `""`
|
||||
- Duration: 757ms
|
||||
- Error Type: Error
|
||||
### json-schema-file-format - qwen/qwq-32b
|
||||
- Prompt: `Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.`
|
||||
- Expected: `{"name":"John Doe","age":30,"tags":["developer","javascript"]}`
|
||||
- Actual: ``
|
||||
- Duration: 3426ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:36 PM
|
||||
- Error Message: expected '"John Doe",\n "age": 30,\n "tags": …' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
|
||||
- Error Details: expected '"John Doe",\n "age": 30,\n "tags": …' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
|
||||
- Reason: expected '"John Doe",\n "age": 30,\n "tags": …' to deeply equal '{"name":"John Doe","age":30,"tags":["…'
|
||||
- Timestamp: 4/2/2025, 12:23:47 AM
|
||||
|
||||
### basic_structure - gpt-4
|
||||
- Prompt: `return a greeting "hello" with count 42`
|
||||
- Expected: `{"greeting":"hello","count":42}`
|
||||
- Actual: `""`
|
||||
- Duration: 1043ms
|
||||
- Error Type: Error
|
||||
### json-schema-object-format - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `Create a user profile with name Jane Smith, age 25, and tags ["designer", "ui"]. Return only the JSON object, no explanation.`
|
||||
- Expected: `{"name":"Jane Smith","age":25,"tags":["designer","ui"]}`
|
||||
- Actual: ``
|
||||
- Duration: 1918ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:37 PM
|
||||
- Error Message: expected '{\n "name": "Jane Smith",\n "age": …' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
|
||||
- Error Details: expected '{\n "name": "Jane Smith",\n "age": …' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
|
||||
- Reason: expected '{\n "name": "Jane Smith",\n "age": …' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
|
||||
- Timestamp: 4/2/2025, 12:23:49 AM
|
||||
|
||||
### basic_structure - anthropic/claude-3.7-sonnet
|
||||
- Prompt: `return a greeting "hello" with count 42`
|
||||
- Expected: `{"greeting":"hello","count":42}`
|
||||
- Actual: `""`
|
||||
- Duration: 1790ms
|
||||
- Error Type: Error
|
||||
### json-schema-object-format - qwen/qwq-32b
|
||||
- Prompt: `Create a user profile with name Jane Smith, age 25, and tags ["designer", "ui"]. Return only the JSON object, no explanation.`
|
||||
- Expected: `{"name":"Jane Smith","age":25,"tags":["designer","ui"]}`
|
||||
- Actual: ``
|
||||
- Duration: 9789ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: Unexpected token 'h', "hello 42" is not valid JSON
|
||||
- Reason: Failed to parse or validate response: Unexpected token 'h', "hello 42" is not valid JSON
|
||||
- Timestamp: 4/1/2025, 1:23:05 PM
|
||||
- Error Message: expected '{"name": "Jane Smith", "age": 25, "ta…' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
|
||||
- Error Details: expected '{"name": "Jane Smith", "age": 25, "ta…' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
|
||||
- Reason: expected '{"name": "Jane Smith", "age": 25, "ta…' to deeply equal '{"name":"Jane Smith","age":25,"tags":…'
|
||||
- Timestamp: 4/2/2025, 12:23:58 AM
|
||||
|
||||
### basic_structure - openai/gpt-4
|
||||
- Prompt: `Return a JSON object with a greeting "hello" and count 42. The response must be valid JSON with exactly these fields: { "greeting": string, "count": number }`
|
||||
- Expected: `{"greeting":"hello","count":42}`
|
||||
- Actual: `""`
|
||||
- Duration: 1258ms
|
||||
- Error Type: Error
|
||||
### json-schema-object-format - gpt-4o
|
||||
- Prompt: `Create a user profile with the following details:
|
||||
- Name: Jane Smith
|
||||
- Age: 25
|
||||
- Email: jane.smith@company.com
|
||||
- Tags: ["developer", "designer"]
|
||||
- Address: 123 Main St, New York, US, 10001
|
||||
- Preferences: light theme, notifications enabled, English language
|
||||
Return only the JSON object, no explanation.`
|
||||
- Expected: `{"name":"Jane Smith","age":25,"email":"jane.smith@company.com","tags":["developer","designer"],"address":{"street":"123 Main St","city":"New York","country":"US","zipCode":"10001"},"preferences":{"theme":"light","notifications":true,"language":"en"}}`
|
||||
- Actual: ``
|
||||
- Duration: 2618ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Invalid response from API
|
||||
- Reason: Invalid response from API
|
||||
- Timestamp: 4/1/2025, 1:32:43 PM
|
||||
- Error Message: expected '{"name":"Jane Smith","age":25,"email"…' to deeply equal '{"name":"Jane Smith","age":25,"email"…'
|
||||
- Error Details: expected '{"name":"Jane Smith","age":25,"email"…' to deeply equal '{"name":"Jane Smith","age":25,"email"…'
|
||||
- Reason: expected '{"name":"Jane Smith","age":25,"email"…' to deeply equal '{"name":"Jane Smith","age":25,"email"…'
|
||||
- Timestamp: 4/2/2025, 12:33:08 AM
|
||||
|
||||
### nested_structure - deepseek/deepseek-chat:free
|
||||
- Prompt: `return user John age 30 with dark theme and notifications enabled`
|
||||
- Expected: `{"user":{"name":"John","age":30},"settings":{"theme":"dark","notifications":true}}`
|
||||
- Actual: `""`
|
||||
- Duration: 655ms
|
||||
- Error Type: Error
|
||||
### zod-string-format - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `Generate a valid email address for a business domain. Return only the email, no explanation.`
|
||||
- Expected: `john.doe@company.com`
|
||||
- Actual: ``
|
||||
- Duration: 1347ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:38 PM
|
||||
- Error Message: expected 'sales@companyplus.com' to deeply equal 'john.doe@company.com'
|
||||
- Error Details: expected 'sales@companyplus.com' to deeply equal 'john.doe@company.com'
|
||||
- Reason: expected 'sales@companyplus.com' to deeply equal 'john.doe@company.com'
|
||||
- Timestamp: 4/2/2025, 12:24:00 AM
|
||||
|
||||
### nested_structure - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `return user John age 30 with dark theme and notifications enabled`
|
||||
- Expected: `{"user":{"name":"John","age":30},"settings":{"theme":"dark","notifications":true}}`
|
||||
- Actual: `""`
|
||||
- Duration: 790ms
|
||||
- Error Type: Error
|
||||
### zod-string-format - qwen/qwq-32b
|
||||
- Prompt: `Generate a valid email address for a business domain. Return only the email, no explanation.`
|
||||
- Expected: `john.doe@company.com`
|
||||
- Actual: ``
|
||||
- Duration: 13704ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:39 PM
|
||||
- Error Message: expected 'info@techstart.com' to deeply equal 'john.doe@company.com'
|
||||
- Error Details: expected 'info@techstart.com' to deeply equal 'john.doe@company.com'
|
||||
- Reason: expected 'info@techstart.com' to deeply equal 'john.doe@company.com'
|
||||
- Timestamp: 4/2/2025, 12:24:13 AM
|
||||
|
||||
### nested_structure - gpt-4
|
||||
- Prompt: `return user John age 30 with dark theme and notifications enabled`
|
||||
- Expected: `{"user":{"name":"John","age":30},"settings":{"theme":"dark","notifications":true}}`
|
||||
- Actual: `""`
|
||||
- Duration: 717ms
|
||||
- Error Type: Error
|
||||
### zod-string-format - gpt-4o
|
||||
- Prompt: `Generate a valid email address for a business domain. Return only the email, no explanation.`
|
||||
- Expected: `john.doe@company.com`
|
||||
- Actual: ``
|
||||
- Duration: 1794ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:40 PM
|
||||
- Error Message: expected 'contact@businessdomain.com' to deeply equal 'john.doe@company.com'
|
||||
- Error Details: expected 'contact@businessdomain.com' to deeply equal 'john.doe@company.com'
|
||||
- Reason: expected 'contact@businessdomain.com' to deeply equal 'john.doe@company.com'
|
||||
- Timestamp: 4/2/2025, 12:32:20 AM
|
||||
|
||||
### nested_structure - anthropic/claude-3.7-sonnet
|
||||
- Prompt: `return user John age 30 with dark theme and notifications enabled`
|
||||
- Expected: `{"user":{"name":"John","age":30},"settings":{"theme":"dark","notifications":true}}`
|
||||
- Actual: `""`
|
||||
- Duration: 1189ms
|
||||
- Error Type: Error
|
||||
### zod-number-format - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `Generate a random age between 18 and 65. Return only the number, no explanation.`
|
||||
- Expected: `25`
|
||||
- Actual: ``
|
||||
- Duration: 1376ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: Unexpected token '#', "# John's U"... is not valid JSON
|
||||
- Reason: Failed to parse or validate response: Unexpected token '#', "# John's U"... is not valid JSON
|
||||
- Timestamp: 4/1/2025, 1:23:06 PM
|
||||
- Error Message: expected '42' to deeply equal '25'
|
||||
- Error Details: expected '42' to deeply equal '25'
|
||||
- Reason: expected '42' to deeply equal '25'
|
||||
- Timestamp: 4/2/2025, 12:24:11 AM
|
||||
|
||||
### nested_structure - openai/gpt-4
|
||||
- Prompt: `Return a JSON object with user John age 30, dark theme and notifications enabled. The response must be valid JSON with this structure: { "user": { "name": string, "age": number }, "settings": { "theme": string, "notifications": boolean } }`
|
||||
- Expected: `{"user":{"name":"John","age":30},"settings":{"theme":"dark","notifications":true}}`
|
||||
- Actual: `""`
|
||||
- Duration: 716ms
|
||||
- Error Type: Error
|
||||
### zod-number-format - gpt-4o
|
||||
- Prompt: `Generate a random age between 18 and 65. Return only the number, no explanation.`
|
||||
- Expected: `25`
|
||||
- Actual: ``
|
||||
- Duration: 2399ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Invalid response from API
|
||||
- Reason: Invalid response from API
|
||||
- Timestamp: 4/1/2025, 1:32:44 PM
|
||||
- Error Message: expected '39' to deeply equal '25'
|
||||
- Error Details: expected '39' to deeply equal '25'
|
||||
- Reason: expected '39' to deeply equal '25'
|
||||
- Timestamp: 4/2/2025, 12:32:23 AM
|
||||
|
||||
### array_structure - deepseek/deepseek-chat:free
|
||||
- Prompt: `return a list of 2 items with ids 1 and 2, names "first" and "second"`
|
||||
- Expected: `{"items":[{"id":1,"name":"first"},{"id":2,"name":"second"}]}`
|
||||
- Actual: `""`
|
||||
- Duration: 617ms
|
||||
- Error Type: Error
|
||||
### zod-array-format - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `Generate a list of 3 programming languages. Return only the array, no explanation.`
|
||||
- Expected: `["JavaScript","Python","Java"]`
|
||||
- Actual: ``
|
||||
- Duration: 1009ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:40 PM
|
||||
- Error Message: expected '["Python", "Java", "JavaScript"]' to deeply equal '["JavaScript","Python","Java"]'
|
||||
- Error Details: expected '["Python", "Java", "JavaScript"]' to deeply equal '["JavaScript","Python","Java"]'
|
||||
- Reason: expected '["Python", "Java", "JavaScript"]' to deeply equal '["JavaScript","Python","Java"]'
|
||||
- Timestamp: 4/2/2025, 12:24:22 AM
|
||||
|
||||
### array_structure - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `return a list of 2 items with ids 1 and 2, names "first" and "second"`
|
||||
- Expected: `{"items":[{"id":1,"name":"first"},{"id":2,"name":"second"}]}`
|
||||
- Actual: `""`
|
||||
- Duration: 756ms
|
||||
- Error Type: Error
|
||||
### zod-array-format - qwen/qwq-32b
|
||||
- Prompt: `Generate a list of 3 programming languages. Return only the array, no explanation.`
|
||||
- Expected: `["JavaScript","Python","Java"]`
|
||||
- Actual: ``
|
||||
- Duration: 4147ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:41 PM
|
||||
- Error Message: expected '["Python", "JavaScript", "Java"]' to deeply equal '["JavaScript","Python","Java"]'
|
||||
- Error Details: expected '["Python", "JavaScript", "Java"]' to deeply equal '["JavaScript","Python","Java"]'
|
||||
- Reason: expected '["Python", "JavaScript", "Java"]' to deeply equal '["JavaScript","Python","Java"]'
|
||||
- Timestamp: 4/2/2025, 12:24:26 AM
|
||||
|
||||
### array_structure - gpt-4
|
||||
- Prompt: `return a list of 2 items with ids 1 and 2, names "first" and "second"`
|
||||
- Expected: `{"items":[{"id":1,"name":"first"},{"id":2,"name":"second"}]}`
|
||||
- Actual: `""`
|
||||
### zod-array-format - gpt-4o
|
||||
- Prompt: `Generate a list of 3 programming languages. Return only the array, no explanation.`
|
||||
- Expected: `["JavaScript","Python","Java"]`
|
||||
- Actual: ``
|
||||
- Duration: 693ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: expected '["Python","JavaScript","Java"]' to deeply equal '["JavaScript","Python","Java"]'
|
||||
- Error Details: expected '["Python","JavaScript","Java"]' to deeply equal '["JavaScript","Python","Java"]'
|
||||
- Reason: expected '["Python","JavaScript","Java"]' to deeply equal '["JavaScript","Python","Java"]'
|
||||
- Timestamp: 4/2/2025, 12:32:23 AM
|
||||
|
||||
### invalid-format - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `Generate a random number.`
|
||||
- Expected: `Invalid format option`
|
||||
- Actual: ``
|
||||
- Duration: 1026ms
|
||||
- Error Type: Error
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:42 PM
|
||||
- Error Message: expected '73' to deeply equal 'Invalid format option'
|
||||
- Error Details: expected '73' to deeply equal 'Invalid format option'
|
||||
- Reason: expected '73' to deeply equal 'Invalid format option'
|
||||
- Timestamp: 4/2/2025, 12:24:27 AM
|
||||
|
||||
### array_structure - anthropic/claude-3.7-sonnet
|
||||
- Prompt: `return a list of 2 items with ids 1 and 2, names "first" and "second"`
|
||||
- Expected: `{"items":[{"id":1,"name":"first"},{"id":2,"name":"second"}]}`
|
||||
- Actual: `""`
|
||||
- Duration: 1190ms
|
||||
- Error Type: Error
|
||||
### invalid-format - qwen/qwq-32b
|
||||
- Prompt: `Generate a random number.`
|
||||
- Expected: `Invalid format option`
|
||||
- Actual: ``
|
||||
- Duration: 7614ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:23:08 PM
|
||||
- Error Message: expected '72' to deeply equal 'Invalid format option'
|
||||
- Error Details: expected '72' to deeply equal 'Invalid format option'
|
||||
- Reason: expected '72' to deeply equal 'Invalid format option'
|
||||
- Timestamp: 4/2/2025, 12:24:35 AM
|
||||
|
||||
### array_structure - openai/gpt-4
|
||||
- Prompt: `Return a JSON object with a list of 2 items. The response must be valid JSON with this structure: { "items": [{ "id": number, "name": string }] }. The first item should have id 1 and name "first", the second item should have id 2 and name "second".`
|
||||
- Expected: `{"items":[{"id":1,"name":"first"},{"id":2,"name":"second"}]}`
|
||||
- Actual: `""`
|
||||
- Duration: 703ms
|
||||
- Error Type: Error
|
||||
### invalid-format - gpt-4o
|
||||
- Prompt: `Generate a random number.`
|
||||
- Expected: `Invalid format option`
|
||||
- Actual: ``
|
||||
- Duration: 826ms
|
||||
- Error Type: AssertionError
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Invalid response from API
|
||||
- Reason: Invalid response from API
|
||||
- Timestamp: 4/1/2025, 1:32:44 PM
|
||||
|
||||
### enum_structure - deepseek/deepseek-chat:free
|
||||
- Prompt: `return status success with message "Operation completed"`
|
||||
- Expected: `{"status":"success","message":"Operation completed"}`
|
||||
- Actual: `""`
|
||||
- Duration: 647ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:43 PM
|
||||
|
||||
### enum_structure - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `return status success with message "Operation completed"`
|
||||
- Expected: `{"status":"success","message":"Operation completed"}`
|
||||
- Actual: `""`
|
||||
- Duration: 813ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:43 PM
|
||||
|
||||
### enum_structure - gpt-4
|
||||
- Prompt: `return status success with message "Operation completed"`
|
||||
- Expected: `{"status":"success","message":"Operation completed"}`
|
||||
- Actual: `""`
|
||||
- Duration: 1138ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:45 PM
|
||||
|
||||
### enum_structure - anthropic/claude-3.7-sonnet
|
||||
- Prompt: `return status success with message "Operation completed"`
|
||||
- Expected: `{"status":"success","message":"Operation completed"}`
|
||||
- Actual: `""`
|
||||
- Duration: 1728ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: Unexpected token '`', "```json
|
||||
{
|
||||
"... is not valid JSON
|
||||
- Reason: Failed to parse or validate response: Unexpected token '`', "```json
|
||||
{
|
||||
"... is not valid JSON
|
||||
- Timestamp: 4/1/2025, 1:23:09 PM
|
||||
|
||||
### enum_structure - openai/gpt-4
|
||||
- Prompt: `Return a JSON object with status "success" and message "Operation completed". The response must be valid JSON with this structure: { "status": "success" | "error" | "pending", "message": string }`
|
||||
- Expected: `{"status":"success","message":"Operation completed"}`
|
||||
- Actual: `""`
|
||||
- Duration: 688ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Invalid response from API
|
||||
- Reason: Invalid response from API
|
||||
- Timestamp: 4/1/2025, 1:32:45 PM
|
||||
|
||||
### optional_fields - deepseek/deepseek-chat:free
|
||||
- Prompt: `return name "John" with age 30 and email "john@example.com"`
|
||||
- Expected: `{"name":"John","age":30,"email":"john@example.com"}`
|
||||
- Actual: `""`
|
||||
- Duration: 676ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:45 PM
|
||||
|
||||
### optional_fields - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `return name "John" with age 30 and email "john@example.com"`
|
||||
- Expected: `{"name":"John","age":30,"email":"john@example.com"}`
|
||||
- Actual: `""`
|
||||
- Duration: 884ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:46 PM
|
||||
|
||||
### optional_fields - gpt-4
|
||||
- Prompt: `return name "John" with age 30 and email "john@example.com"`
|
||||
- Expected: `{"name":"John","age":30,"email":"john@example.com"}`
|
||||
- Actual: `""`
|
||||
- Duration: 669ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Reason: Failed to parse or validate response: [
|
||||
{
|
||||
"code": "invalid_type",
|
||||
"expected": "object",
|
||||
"received": "null",
|
||||
"path": [],
|
||||
"message": "Expected object, received null"
|
||||
}
|
||||
]
|
||||
- Timestamp: 4/1/2025, 1:21:47 PM
|
||||
|
||||
### optional_fields - anthropic/claude-3.7-sonnet
|
||||
- Prompt: `return name "John" with age 30 and email "john@example.com"`
|
||||
- Expected: `{"name":"John","age":30,"email":"john@example.com"}`
|
||||
- Actual: `""`
|
||||
- Duration: 1576ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Failed to parse or validate response: Unexpected token '`', "```json
|
||||
{
|
||||
"... is not valid JSON
|
||||
- Reason: Failed to parse or validate response: Unexpected token '`', "```json
|
||||
{
|
||||
"... is not valid JSON
|
||||
- Timestamp: 4/1/2025, 1:23:11 PM
|
||||
|
||||
### optional_fields - openai/gpt-4
|
||||
- Prompt: `Return a JSON object with name "John", age 30, and email "john@example.com". The response must be valid JSON with this structure: { "name": string, "age"?: number, "email"?: string }`
|
||||
- Expected: `{"name":"John","age":30,"email":"john@example.com"}`
|
||||
- Actual: `""`
|
||||
- Duration: 682ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Invalid response from API
|
||||
- Reason: Invalid response from API
|
||||
- Timestamp: 4/1/2025, 1:32:46 PM
|
||||
- Error Message: expected '786984' to deeply equal 'Invalid format option'
|
||||
- Error Details: expected '786984' to deeply equal 'Invalid format option'
|
||||
- Reason: expected '786984' to deeply equal 'Invalid format option'
|
||||
- Timestamp: 4/2/2025, 12:32:24 AM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
*No passed tests*
|
||||
### json-schema-file-format - gpt-4o
|
||||
- Prompt: `Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.`
|
||||
- Expected: `{"name":"John Doe","age":30,"tags":["developer","javascript"]}`
|
||||
- Actual: ````json
|
||||
{
|
||||
"name": "John Doe",
|
||||
"age": 30,
|
||||
"tags": ["developer", "javascript"]
|
||||
}
|
||||
````
|
||||
- Duration: 995ms
|
||||
- Timestamp: 4/2/2025, 12:34:28 AM
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 38
|
||||
- Passed: 5
|
||||
- Failed: 33
|
||||
- Success Rate: 13.16%
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,295 +1,508 @@
|
||||
import { describe, it, expect } from 'vitest'
|
||||
import { run } from '../../src/index'
|
||||
import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { z } from 'zod'
|
||||
|
||||
|
||||
import {
|
||||
models_premium as models,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
formatError,
|
||||
isEmptyResponse,
|
||||
getRouterForModel,
|
||||
getApiKeyForRouter
|
||||
} from './commons'
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './format.json')
|
||||
|
||||
describe('Structured Output Format', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
// Load existing results if any
|
||||
if (exists(TEST_LOG_PATH)) {
|
||||
const data = read(TEST_LOG_PATH, 'json')
|
||||
testResults = Array.isArray(data) ? data : []
|
||||
}
|
||||
|
||||
/**
 * Runs one structured-output prompt against `modelName`, validates the reply
 * against `format`, and appends exactly one entry (pass or fail) to the JSON log.
 *
 * @param prompt    Prompt sent to the model.
 * @param format    Zod schema handed to `run()` as `format` and used to validate the reply.
 * @param expected  Value the validated reply must deep-equal.
 * @param testName  Key under which the outcome is logged.
 * @param modelName Model requested from `run()`.
 * @returns The logged entry when the test passes.
 * @throws Rethrows any failure (timeout, empty reply, parse/validation error,
 *         expectation mismatch) after it has been logged.
 */
const runFormatTest = async (prompt: string, format: z.ZodType<any>, expected: any, testName: string, modelName: string) => {
  let model = 'unknown'
  let router = 'unknown'
  const startTime = Date.now()
  let timer: ReturnType<typeof setTimeout> | undefined
  let response: any[] = []
  let testResult: TestResult | undefined

  // Fields shared by pass and fail entries. Evaluated lazily so that
  // model/router reflect what onRun reported and duration is final.
  const describeRun = () => ({
    test: testName,
    prompt,
    expected,
    model,
    router,
    timestamp: new Date().toISOString(),
    duration: Date.now() - startTime,
    // Only the router name is recorded: this object is written to a log file
    // on disk, so the API key must not be part of it.
    config: {
      router: getRouterForModel(modelName)
    }
  })

  try {
    const result = await Promise.race([
      run({
        prompt,
        mode: 'completion',
        model: modelName,
        path: TEST_BASE_PATH,
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        format,
        onRun: async (options) => {
          model = options.model || 'unknown'
          router = options.router || 'unknown'
          return options
        }
      }),
      new Promise((_, reject) => {
        timer = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
      })
    ]) as any[]

    // Check for an empty reply before parsing, otherwise it would surface as
    // a misleading parse/validation error.
    if (isEmptyResponse(result)) {
      throw new Error('Model returned empty response')
    }
    response = result || []

    const actual = result?.[0]
    let parsed: any
    try {
      // Replies may arrive as a JSON string or as an already-parsed value.
      parsed = format.parse(typeof actual === 'string' ? JSON.parse(actual) : actual)
    } catch (parseError) {
      const detail = parseError instanceof Error ? parseError.message : String(parseError)
      throw new Error(`Failed to parse or validate response: ${detail}`)
    }

    // Throws on mismatch, so reaching the next line means the test passed.
    expect(parsed).toEqual(expected)

    testResult = {
      ...describeRun(),
      result: response,
      passed: true,
      reason: undefined
    }
    return testResult
  } catch (e) {
    const error = formatError(e)
    testResult = {
      ...describeRun(),
      // Keep whatever the model returned so the report can show it.
      result: response,
      passed: false,
      error,
      reason: error?.message || 'Unknown error occurred'
    }
    throw e
  } finally {
    // Promise.race does not cancel the losing timer; clear it explicitly.
    clearTimeout(timer)
    if (testResult) {
      testResults.push(testResult)
      write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
    }
  }
}
|
||||
|
||||
// One row per scenario: the log key, the test title (without the model
// suffix), the prompt, the zod schema for the reply and the expected value.
const structuredOutputCases = [
  {
    key: 'basic_structure',
    title: 'should return basic structured output',
    prompt: 'return a greeting "hello" with count 42',
    format: z.object({
      greeting: z.string(),
      count: z.number()
    }),
    expected: { greeting: 'hello', count: 42 }
  },
  {
    key: 'nested_structure',
    title: 'should handle nested structures',
    prompt: 'return user John age 30 with dark theme and notifications enabled',
    format: z.object({
      user: z.object({
        name: z.string(),
        age: z.number()
      }),
      settings: z.object({
        theme: z.string(),
        notifications: z.boolean()
      })
    }),
    expected: {
      user: { name: 'John', age: 30 },
      settings: { theme: 'dark', notifications: true }
    }
  },
  {
    key: 'array_structure',
    title: 'should handle arrays',
    prompt: 'return a list of 2 items with ids 1 and 2, names "first" and "second"',
    format: z.object({
      items: z.array(z.object({
        id: z.number(),
        name: z.string()
      }))
    }),
    expected: {
      items: [
        { id: 1, name: 'first' },
        { id: 2, name: 'second' }
      ]
    }
  },
  {
    key: 'enum_structure',
    title: 'should handle enums',
    prompt: 'return status success with message "Operation completed"',
    format: z.object({
      status: z.enum(['success', 'error', 'pending']),
      message: z.string()
    }),
    expected: {
      status: 'success',
      message: 'Operation completed'
    }
  },
  {
    key: 'optional_fields',
    title: 'should handle optional fields',
    prompt: 'return name "John" with age 30 and email "john@example.com"',
    format: z.object({
      name: z.string(),
      age: z.number().optional(),
      email: z.string().email().optional()
    }),
    expected: {
      name: 'John',
      age: 30,
      email: 'john@example.com'
    }
  }
]

// Register every scenario once per model, in table order.
for (const { key, title, prompt, format, expected } of structuredOutputCases) {
  it.each(models)(`${title} with model %s`, async (modelName) => {
    await runFormatTest(prompt, format, expected, key, modelName)
  })
}
|
||||
|
||||
it('should generate markdown report', () => {
  // test name -> model -> most recent entry recorded for that pair
  const newestByTest = new Map<string, Map<string, TestResult>>()
  for (const entry of testResults) {
    let perModel = newestByTest.get(entry.test)
    if (perModel === undefined) {
      perModel = new Map<string, TestResult>()
      newestByTest.set(entry.test, perModel)
    }
    const known = perModel.get(entry.model)
    const isNewer = !known || new Date(entry.timestamp) > new Date(known.timestamp)
    if (isNewer) {
      perModel.set(entry.model, entry)
    }
  }

  // Render every entry once and sort it into the failed or passed bucket;
  // iteration order (test, then model) is kept inside each bucket.
  const failedBlocks: string[] = []
  const passedBlocks: string[] = []
  for (const [testName, perModel] of newestByTest) {
    for (const [model, entry] of perModel) {
      const lines = [
        `### ${testName} - ${model}`,
        `- Prompt: \`${entry.prompt}\``,
        `- Expected: \`${JSON.stringify(entry.expected)}\``,
        `- Actual: \`${JSON.stringify(entry.result[0] || '')}\``,
        `- Duration: ${entry.duration}ms`
      ]
      if (!entry.passed) {
        // Failure details are only listed for failed entries.
        if (entry.error) {
          lines.push(`- Error Type: ${entry.error.type}`)
          lines.push(`- Error Code: ${entry.error.code}`)
          lines.push(`- Error Message: ${entry.error.message}`)
        }
        lines.push(`- Reason: ${entry.reason}`)
      }
      lines.push(`- Timestamp: ${new Date(entry.timestamp).toLocaleString()}`)

      const block = lines.join('\n') + '\n\n'
      if (entry.passed) {
        passedBlocks.push(block)
      } else {
        failedBlocks.push(block)
      }
    }
  }

  // Assemble the document: failures first, then passes.
  const report = [
    '# Format Test Results\n\n',
    '## Failed Tests\n\n',
    failedBlocks.length > 0 ? failedBlocks.join('') : '*No failed tests*\n\n',
    '## Passed Tests\n\n',
    passedBlocks.length > 0 ? passedBlocks.join('') : '*No passed tests*\n\n'
  ].join('')

  // Persist the report and check that a file now exists at that path.
  const reportPath = path.resolve(__dirname, './format-report.md')
  write(reportPath, report)
  expect(exists(reportPath) === 'file').toBe(true)
})
|
||||
import { describe, it, expect } from 'vitest'
|
||||
import { run } from '../../src/index'
|
||||
import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { z } from 'zod'
|
||||
import {
|
||||
models,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
formatError,
|
||||
isEmptyResponse
|
||||
} from './commons'
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './format.json')
|
||||
const TEST_SCHEMA_PATH = path.resolve(__dirname, './test-schema.json')
|
||||
|
||||
// Sample JSON Schema for testing
|
||||
// JSON Schema (draft-07) for a user profile, used as the `format` fixture.
const testJsonSchema = {
  $schema: "http://json-schema.org/draft-07/schema#",
  $id: "https://example.com/user-profile.schema.json",
  title: "User Profile",
  description: "A user profile containing name, age, and tags",
  type: "object",
  properties: {
    // Scalar identity fields
    name: {
      type: "string",
      description: "User's full name",
      minLength: 1,
      pattern: "^[A-Za-z\\s]+$"
    },
    age: {
      type: "number",
      description: "User's age in years",
      minimum: 0,
      maximum: 150
    },
    email: {
      type: "string",
      description: "User's email address",
      format: "email"
    },
    // One to five distinct tags drawn from a fixed vocabulary
    tags: {
      type: "array",
      description: "List of user's tags",
      items: {
        type: "string",
        enum: ["developer", "designer", "manager", "admin", "user"]
      },
      minItems: 1,
      maxItems: 5,
      uniqueItems: true
    },
    // Nested object with its own required list
    address: {
      type: "object",
      description: "User's address",
      properties: {
        street: {
          type: "string",
          description: "Street address"
        },
        city: {
          type: "string",
          description: "City name"
        },
        country: {
          type: "string",
          description: "Country name",
          enum: ["US", "UK", "CA", "AU"]
        },
        zipCode: {
          type: "string",
          description: "ZIP/Postal code",
          pattern: "^[0-9]{5}(-[0-9]{4})?$"
        }
      },
      required: ["street", "city", "country"]
    },
    // Nested object whose members all carry defaults
    preferences: {
      type: "object",
      description: "User preferences",
      properties: {
        theme: {
          type: "string",
          enum: ["light", "dark", "system"],
          default: "system"
        },
        notifications: {
          type: "boolean",
          default: true
        },
        language: {
          type: "string",
          enum: ["en", "es", "fr", "de", "ja"],
          default: "en"
        }
      }
    }
  },
  required: ["name", "age", "email"],
  additionalProperties: false
}
|
||||
|
||||
// Write the schema to TEST_SCHEMA_PATH (pretty-printed with 2-space indent)
// so it can also be supplied to run() as a file path instead of an object.
// This is a top-level statement, so it executes when the module is loaded.
write(TEST_SCHEMA_PATH, JSON.stringify(testJsonSchema, null, 2))
|
||||
|
||||
// Helper function to normalize JSON strings
|
||||
/**
 * Canonicalises a JSON payload so two payloads can be compared as strings.
 *
 * Markdown code fences are stripped first, whatever their language tag
 * (```json, ```JSON, bare ```), and with either LF or CRLF line endings.
 * The remainder is parsed and re-serialised without whitespace.
 *
 * @param json Raw text, possibly wrapped in a fenced code block.
 * @returns The compact JSON string, or `json` unchanged when the text
 *          (after fence removal) is not valid JSON.
 */
const normalizeJson = (json: string) => {
  try {
    // First alternative: an opening fence with optional tag, up to its newline.
    // Second alternative: a closing fence together with the newline before it.
    const cleanJson = json.replace(/```[\w-]*[ \t]*\r?\n|\r?\n[ \t]*```/g, '').trim()
    return JSON.stringify(JSON.parse(cleanJson))
  } catch {
    // Not JSON (or not a string): hand the input back untouched.
    return json
  }
}
|
||||
|
||||
// Helper function to validate email
|
||||
/**
 * Loose shape check for an email address: one "@", no whitespace, and at
 * least one "." somewhere in the part after the "@".
 *
 * @param email Candidate address.
 * @returns true when the candidate has that shape.
 */
const isValidEmail = (email: string) => {
  const emailShape = /^[^\s@]+@[^\s@]+\.[^\s@]+$/
  const looksLikeEmail = emailShape.test(email)
  return looksLikeEmail
}
|
||||
|
||||
// Helper function to validate number in range
|
||||
/**
 * Inclusive range check.
 *
 * @param num Value under test.
 * @param min Lower bound (inclusive).
 * @param max Upper bound (inclusive).
 * @returns true when min <= num <= max; false for NaN.
 */
const isNumberInRange = (num: number, min: number, max: number) => {
  const atLeastMin = min <= num
  const atMostMax = max >= num
  return atLeastMin && atMostMax
}
|
||||
|
||||
// Helper function to validate array length
|
||||
/**
 * Checks that a value is an array of exactly `length` entries, all strings.
 *
 * @param arr    Value under test.
 * @param length Required number of entries.
 * @returns true only when all three conditions hold.
 */
const hasValidArrayLength = (arr: any[], length: number) => {
  if (!Array.isArray(arr)) {
    return false
  }
  if (arr.length !== length) {
    return false
  }
  const hasNonString = arr.some(item => typeof item !== 'string')
  return !hasNonString
}
|
||||
|
||||
describe('Format Options', () => {
|
||||
// Seed the in-memory log with whatever earlier runs persisted to disk;
// anything that is not a JSON array is ignored.
const previousRuns = exists(TEST_LOG_PATH) ? read(TEST_LOG_PATH, 'json') : undefined
let testResults: TestResult[] = Array.isArray(previousRuns) ? previousRuns : []
|
||||
|
||||
/**
 * Sends `prompt` through `run()`, compares the JSON-normalised first reply
 * with `expected`, and appends one entry (pass or fail) to the JSON log.
 *
 * @param prompt    Prompt sent to the model.
 * @param expected  Expected reply; compared after `normalizeJson` on both sides.
 * @param testName  Key under which the outcome is logged.
 * @param modelName Model requested from `run()` (can be overridden via `options.model`).
 * @param options   Extra options spread into the `run()` call, e.g. `{ format }`.
 * @throws Rethrows any failure (timeout, empty reply, mismatch) after it has been logged.
 */
const runFormatTest = async (prompt: string, expected: string, testName: string, modelName: string, options: any = {}) => {
  // Start from the requested model; onRun replaces both with the values
  // run() actually resolved.
  let model = modelName
  let router = 'openai'
  const startTime = Date.now()
  let timer: ReturnType<typeof setTimeout> | undefined
  let response: string[] = []
  let testResult: TestResult | undefined

  try {
    const result = await Promise.race([
      run({
        prompt,
        mode: 'completion',
        model: modelName,
        router: 'openai',
        path: TEST_BASE_PATH,
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        ...options,
        onRun: async (resolved) => {
          model = resolved.model || 'unknown'
          router = resolved.router || 'unknown'
          return resolved
        }
      }),
      new Promise((_, reject) => {
        timer = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
      })
    ]) as string[]

    if (isEmptyResponse(result)) {
      throw new Error('Model returned empty response')
    }
    response = result || []

    const actual = result?.[0]?.trim() || ''
    const normalizedActual = normalizeJson(actual)
    const normalizedExpected = normalizeJson(expected)

    // Throws on mismatch, so reaching the next statement means the test passed.
    expect(normalizedActual).toEqual(normalizedExpected)

    testResult = {
      test: testName,
      prompt,
      result: response,
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed: true,
      duration: Date.now() - startTime,
      reason: undefined
    }
  } catch (e) {
    const error = formatError(e)
    testResult = {
      test: testName,
      prompt,
      // Keep whatever the model returned so the report can show it.
      result: response,
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed: false,
      duration: Date.now() - startTime,
      error,
      reason: error?.message || 'Unknown error occurred'
    }
    throw e
  } finally {
    // Promise.race does not cancel the losing timer; clear it explicitly.
    clearTimeout(timer)
    if (testResult) {
      testResults.push(testResult)
      write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
    }
  }
}
|
||||
|
||||
// Calls run() with the fixed gpt-4o/openai test configuration; only the
// prompt and the `format` option vary between the cases below.
const runWithFormat = async (prompt: string, format: any) =>
  (await run({
    prompt,
    mode: 'completion',
    model: 'gpt-4o',
    router: 'openai',
    path: TEST_BASE_PATH,
    logs: TEST_LOGS_PATH,
    preferences: TEST_PREFERENCES_PATH,
    format
  })) as string[]

// The single-property schemas below describe an object, so a reply may be
// either the bare value or `{ "<key>": value }`; accept both.
// NOTE(review): confirm which shape run() actually returns for these schemas.
const unwrapField = (raw: string | undefined, key: string): unknown => {
  const text = (raw ?? '').trim()
  let value: unknown
  try {
    value = JSON.parse(normalizeJson(text))
  } catch {
    // Not JSON at all: treat the text itself as the bare value.
    return text
  }
  if (value !== null && typeof value === 'object' && !Array.isArray(value) && key in value) {
    return (value as Record<string, unknown>)[key]
  }
  return value
}

// JSON Schema supplied as a file path
it('should format response according to JSON Schema file', async () => {
  const prompt = 'Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.'
  // NOTE(review): the schema written to TEST_SCHEMA_PATH marks `email` as
  // required and its `tags` enum has no "javascript" entry, so a reply that
  // conforms to the schema cannot equal `expected` — confirm the intent.
  const expected = JSON.stringify({
    name: "John Doe",
    age: 30,
    tags: ["developer", "javascript"]
  })

  await runFormatTest(
    prompt,
    expected,
    'json-schema-file-format',
    'gpt-4o',
    {
      format: TEST_SCHEMA_PATH
    }
  )
}, { timeout: 10000 })

// JSON Schema supplied as an in-memory object
it('should format response according to JSON Schema object', async () => {
  const prompt = `Create a user profile with the following details:
- Name: Jane Smith
- Age: 25
- Email: jane.smith@company.com
- Tags: ["developer", "designer"]
- Address: 123 Main St, New York, US, 10001
- Preferences: light theme, notifications enabled, English language
Return only the JSON object, no explanation.`

  const result = await runWithFormat(prompt, testJsonSchema)
  const response = JSON.parse(normalizeJson(result?.[0] || '{}'))

  // Required fields
  expect(response.name).toBe('Jane Smith')
  expect(response.age).toBe(25)
  expect(response.email).toBe('jane.smith@company.com')

  // Tags
  expect(Array.isArray(response.tags)).toBe(true)
  expect(response.tags).toContain('developer')
  expect(response.tags).toContain('designer')

  // Address (either spelling of the postal-code key is accepted)
  expect(response.address.street).toBe('123 Main St')
  expect(response.address.city).toBe('New York')
  expect(response.address.country).toBe('US')
  expect(response.address.zipCode || response.address.postal_code).toMatch(/^[0-9]{5}$/)

  // Preferences (several spellings of "enabled" / "English" are accepted)
  expect(response.preferences.theme).toBe('light')
  expect(['true', true, 'enabled'].includes(response.preferences.notifications)).toBe(true)
  expect(['en', 'English'].includes(response.preferences.language)).toBe(true)
}, { timeout: 10000 })

// JSON Schema with a single string property (format: email)
it('should format response according to Zod string schema', async () => {
  const prompt = 'Generate a valid email address for a business domain. Return only the email, no explanation.'

  const result = await runWithFormat(prompt, {
    type: "object",
    properties: {
      email: {
        type: "string",
        format: "email"
      }
    },
    required: ["email"]
  })

  const email = unwrapField(result?.[0], 'email')
  expect(isValidEmail(String(email))).toBe(true)
}, { timeout: 10000 })

// JSON Schema with a single bounded number property
it('should format response according to Zod number schema', async () => {
  const prompt = 'Generate a random age between 18 and 65. Return only the number, no explanation.'

  const result = await runWithFormat(prompt, {
    type: "object",
    properties: {
      age: {
        type: "number",
        minimum: 18,
        maximum: 65
      }
    },
    required: ["age"]
  })

  // An empty or non-numeric reply parses to NaN, which is never in range.
  const age = Number.parseInt(String(unwrapField(result?.[0], 'age')), 10)
  expect(isNumberInRange(age, 18, 65)).toBe(true)
}, { timeout: 10000 })

// JSON Schema with a single fixed-length string-array property
it('should format response according to Zod array schema', async () => {
  const prompt = 'Generate a list of 3 programming languages. Return only the array, no explanation.'

  const result = await runWithFormat(prompt, {
    type: "object",
    properties: {
      languages: {
        type: "array",
        items: {
          type: "string"
        },
        minItems: 3,
        maxItems: 3
      }
    },
    required: ["languages"]
  })

  const languages = unwrapField(result?.[0], 'languages')
  expect(hasValidArrayLength(languages as any[], 3)).toBe(true)
}, { timeout: 10000 })

// A schema with an unknown `type` must be rejected by run()
it('should handle invalid format option', async () => {
  const prompt = 'Generate a random number.'

  // Capture the rejection outside the try block so the "did not fail"
  // sentinel below cannot be swallowed by this test's own catch.
  let rejected = false
  let failure: any
  try {
    await runWithFormat(prompt, {
      type: "invalid",
      properties: {}
    })
  } catch (e) {
    rejected = true
    failure = e
  }

  if (!rejected) {
    throw new Error('Expected format validation to fail')
  }

  // The rejection should be about an invalid format/schema.
  const message = String(failure?.message ?? failure)
  if (!/invalid|Invalid|schema|Schema/.test(message)) {
    throw new Error(`Unexpected error: ${message}`)
  }
}, { timeout: 10000 })
|
||||
|
||||
it('should generate markdown report', () => {
  // test name -> model -> most recent entry recorded for that pair
  const latestResults = new Map<string, Map<string, TestResult>>()
  for (const result of testResults) {
    let testMap = latestResults.get(result.test)
    if (testMap === undefined) {
      testMap = new Map<string, TestResult>()
      latestResults.set(result.test, testMap)
    }
    const existingResult = testMap.get(result.model)
    if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
      testMap.set(result.model, result)
    }
  }

  // Renders one "### test - model" entry. Error details and the reason are
  // only included for failed results.
  const renderEntry = (testName: string, model: string, result: TestResult) => {
    let entry = `### ${testName} - ${model}\n`
    entry += `- Prompt: \`${result.prompt}\`\n`
    entry += `- Expected: \`${result.expected}\`\n`
    entry += `- Actual: \`${result.result[0] || ''}\`\n`
    entry += `- Duration: ${result.duration}ms\n`
    if (!result.passed) {
      if (result.error) {
        entry += `- Error Type: ${result.error.type}\n`
        entry += `- Error Code: ${result.error.code}\n`
        entry += `- Error Message: ${result.error.message}\n`
        if (result.error.details?.message) {
          entry += `- Error Details: ${result.error.details.message}\n`
        }
      }
      entry += `- Reason: ${result.reason}\n`
    }
    entry += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
    return entry
  }

  // Renders a whole section for either the passed or the failed results,
  // falling back to `emptyNote` when nothing matches.
  const renderSection = (title: string, wantPassed: boolean, emptyNote: string) => {
    let body = ''
    for (const [testName, modelResults] of latestResults) {
      for (const [model, result] of modelResults) {
        if (Boolean(result.passed) === wantPassed) {
          body += renderEntry(testName, model, result)
        }
      }
    }
    return `## ${title}\n\n` + (body === '' ? `${emptyNote}\n\n` : body)
  }

  let report = '# Format Test Results\n\n'
  report += renderSection('Failed Tests', false, '*No failed tests*')
  report += renderSection('Passed Tests', true, '*No passed tests*')

  // Summary over every recorded run (not only the latest per test/model).
  report += '## Summary\n\n'
  const totalTests = testResults.length
  const passedTests = testResults.filter(r => r.passed).length
  const failedTests = totalTests - passedTests
  // Guard the division: an empty log would otherwise print "NaN%".
  const successRate = totalTests === 0 ? 0 : (passedTests / totalTests) * 100
  report += `- Total Tests: ${totalTests}\n`
  report += `- Passed: ${passedTests}\n`
  report += `- Failed: ${failedTests}\n`
  report += `- Success Rate: ${successRate.toFixed(2)}%\n\n`

  // Write the report and check that a file now exists at that path.
  const reportPath = path.resolve(__dirname, './format-report.md')
  write(reportPath, report)
  expect(exists(reportPath) === 'file').toBe(true)
})
|
||||
})
|
||||
@ -36,9 +36,9 @@
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 1253ms
|
||||
- Duration: 1192ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:47:26 PM
|
||||
- Timestamp: 4/1/2025, 11:53:55 PM
|
||||
|
||||
### spanish - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
@ -74,9 +74,9 @@
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 932ms
|
||||
- Duration: 1193ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:47:27 PM
|
||||
- Timestamp: 4/1/2025, 11:53:56 PM
|
||||
|
||||
### french - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
@ -112,9 +112,9 @@
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 864ms
|
||||
- Duration: 968ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:47:28 PM
|
||||
- Timestamp: 4/1/2025, 11:53:57 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
|
||||
@ -807,5 +807,41 @@
|
||||
"passed": false,
|
||||
"duration": 864,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T21:53:55.163Z",
|
||||
"passed": false,
|
||||
"duration": 1192,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T21:53:56.357Z",
|
||||
"passed": false,
|
||||
"duration": 1193,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T21:53:57.326Z",
|
||||
"passed": false,
|
||||
"duration": 968,
|
||||
"reason": "Unknown error occurred"
|
||||
}
|
||||
]
|
||||
@ -36,9 +36,9 @@
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: ``
|
||||
- Duration: 1218ms
|
||||
- Duration: 1992ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:46:27 PM
|
||||
- Timestamp: 4/1/2025, 11:53:40 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-chat:free
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
@ -74,9 +74,9 @@
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: ``
|
||||
- Duration: 911ms
|
||||
- Duration: 1078ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:46:27 PM
|
||||
- Timestamp: 4/1/2025, 11:53:41 PM
|
||||
|
||||
### division - deepseek/deepseek-chat:free
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
@ -112,9 +112,9 @@
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: ``
|
||||
- Duration: 1485ms
|
||||
- Duration: 940ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:46:29 PM
|
||||
- Timestamp: 4/1/2025, 11:53:42 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
|
||||
@ -2213,5 +2213,41 @@
|
||||
"passed": false,
|
||||
"duration": 1485,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T21:53:40.351Z",
|
||||
"passed": false,
|
||||
"duration": 1992,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T21:53:41.431Z",
|
||||
"passed": false,
|
||||
"duration": 1078,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T21:53:42.372Z",
|
||||
"passed": false,
|
||||
"duration": 940,
|
||||
"reason": "Unknown error occurred"
|
||||
}
|
||||
]
|
||||
113
packages/kbot/tests/unit/test-schema.json
Normal file
113
packages/kbot/tests/unit/test-schema.json
Normal file
@ -0,0 +1,113 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"$id": "https://example.com/user-profile.schema.json",
|
||||
"title": "User Profile",
|
||||
"description": "A user profile containing name, age, and tags",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "User's full name",
|
||||
"minLength": 1,
|
||||
"pattern": "^[A-Za-z\\s]+$"
|
||||
},
|
||||
"age": {
|
||||
"type": "number",
|
||||
"description": "User's age in years",
|
||||
"minimum": 0,
|
||||
"maximum": 150
|
||||
},
|
||||
"email": {
|
||||
"type": "string",
|
||||
"description": "User's email address",
|
||||
"format": "email"
|
||||
},
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"description": "List of user's tags",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"developer",
|
||||
"designer",
|
||||
"manager",
|
||||
"admin",
|
||||
"user"
|
||||
]
|
||||
},
|
||||
"minItems": 1,
|
||||
"maxItems": 5,
|
||||
"uniqueItems": true
|
||||
},
|
||||
"address": {
|
||||
"type": "object",
|
||||
"description": "User's address",
|
||||
"properties": {
|
||||
"street": {
|
||||
"type": "string",
|
||||
"description": "Street address"
|
||||
},
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "City name"
|
||||
},
|
||||
"country": {
|
||||
"type": "string",
|
||||
"description": "Country name",
|
||||
"enum": [
|
||||
"US",
|
||||
"UK",
|
||||
"CA",
|
||||
"AU"
|
||||
]
|
||||
},
|
||||
"zipCode": {
|
||||
"type": "string",
|
||||
"description": "ZIP/Postal code",
|
||||
"pattern": "^[0-9]{5}(-[0-9]{4})?$"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"street",
|
||||
"city",
|
||||
"country"
|
||||
]
|
||||
},
|
||||
"preferences": {
|
||||
"type": "object",
|
||||
"description": "User preferences",
|
||||
"properties": {
|
||||
"theme": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"light",
|
||||
"dark",
|
||||
"system"
|
||||
],
|
||||
"default": "system"
|
||||
},
|
||||
"notifications": {
|
||||
"type": "boolean",
|
||||
"default": true
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"en",
|
||||
"es",
|
||||
"fr",
|
||||
"de",
|
||||
"ja"
|
||||
],
|
||||
"default": "en"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"age",
|
||||
"email"
|
||||
],
|
||||
"additionalProperties": false
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user