fix:response format for tests

This commit is contained in:
lovebird 2025-06-06 00:35:53 +02:00
parent d9dc88b972
commit 1535cb754a
15 changed files with 4661 additions and 283 deletions

View File

@ -4,6 +4,23 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "node",
"request": "launch",
"name": "Vitest: Debug Open File",
"program": "${workspaceFolder}/node_modules/vitest/vitest.mjs",
"args": [
"run",
"${relativeFile}"
],
"skipFiles": [
"<node_internals>/**"
],
"console": "integratedTerminal",
"sourceMaps": true,
"smartStep": true,
"internalConsoleOptions": "neverOpen"
},
{
"type": "node",
"request": "launch",
@ -768,5 +785,6 @@
"console": "integratedTerminal",
"outputCapture": "std"
}
]
],
"compounds": []
}

File diff suppressed because one or more lines are too long

View File

@ -3,7 +3,7 @@
"messages": [
{
"role": "user",
"content": "multiply 8 and 3. Return only the number, no explanation."
"content": "Provide a synonym for \"happy\". Return only the synonym, no explanation."
},
{
"role": "user",

View File

@ -9,6 +9,7 @@
"version": "0.3.5",
"license": "MIT",
"dependencies": {
"@dmitryrechkin/json-schema-to-zod": "1.0.1",
"@polymech/ai-tools": "file:../ai-tools",
"@polymech/cache": "file:../cache",
"@polymech/commons": "file:../commons",
@ -400,6 +401,14 @@
"node": ">=14.17.0"
}
},
"node_modules/@dmitryrechkin/json-schema-to-zod": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@dmitryrechkin/json-schema-to-zod/-/json-schema-to-zod-1.0.1.tgz",
"integrity": "sha512-cG9gC4NMu/7JZqmRZy6uIb+l+kxek2GFQ0/qrhw7xeFK2l5B9yF9FVuujoqFPLRGDHNFYqtBWht7hY4KB0ngrA==",
"dependencies": {
"zod": "^3.23.8"
}
},
"node_modules/@esbuild/aix-ppc64": {
"version": "0.21.5",
"resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",

View File

@ -53,6 +53,7 @@
"examples:iterator-markdown:no-cache": "node dist-in/examples/core/iterator-markdown-example.js --no-cache"
},
"dependencies": {
"@dmitryrechkin/json-schema-to-zod": "1.0.1",
"@polymech/ai-tools": "file:../ai-tools",
"@polymech/cache": "file:../cache",
"@polymech/commons": "file:../commons",

View File

@ -88,6 +88,7 @@ export const complete_options = async (opts: IKBotTask): Promise<IKBotTask | nul
options.client = client
options.variables = { ...options.variables, ...variables(options) }
options.collector = collector(options, client)
options.format = opts.format
options.onRun = options.onRun || (async (options) => options)
return options
@ -174,7 +175,6 @@ export const complete_params = async (
if (options.mode === E_Mode.TOOLS || options.mode === E_Mode.ASSISTANT) {
params.tools = await loadTools(options)
params.tool_choice = 'auto'
//params.parallel_tool_calls = false
}
return params

View File

@ -5,16 +5,46 @@ import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { sync as mkdirp } from "mkdirp"
import { zodResponseFormat } from "openai/helpers/zod"
import { JSONSchemaToZod } from '@dmitryrechkin/json-schema-to-zod';
export enum ModelCategory {
FAST = 'fast',
LANGUAGE = 'language',
TOOL = 'tool',
ALL = 'all',
CODING = 'coding',
FILES = 'file'
FILES = 'file',
TEST_EQUAL = ''
}
/**
 * Strategies for deciding whether a model's answer matches the expected value.
 *
 * Note: `LLM_EQUAL` is declared here but has no entry in `EQUALITY_CHECKS`;
 * a lookup for it yields `undefined`, so callers need their own fallback.
 */
export enum EqualityCheck {
  DEFAULT = 'default',
  JSON_EQUAL = 'json_equal',
  LLM_EQUAL = 'llm_equal',
  NONE = 'none'
}

/** Async predicate: resolves to `true` when `actual` is considered equal to `expected`. */
export type EqualityFn = (actual: string, expected: string) => Promise<boolean>;

/**
 * Recursively rebuilds a parsed JSON value with object keys in sorted order.
 * Arrays keep their element order, because order is significant for JSON arrays.
 */
const canonicalizeJson = (value: unknown): unknown => {
  if (Array.isArray(value)) {
    return value.map(canonicalizeJson);
  }
  if (value !== null && typeof value === 'object') {
    const source = value as Record<string, unknown>;
    // Object.fromEntries defines own data properties, so a "__proto__" key is kept as data.
    return Object.fromEntries(
      Object.keys(source)
        .sort()
        .map((key): [string, unknown] => [key, canonicalizeJson(source[key])])
    );
  }
  return value;
};

/**
 * Lookup table from equality-check name to its comparison function.
 *
 * - `default`: trimmed, case-insensitive string comparison; null/undefined/empty inputs are treated as ''.
 * - `json_equal`: both sides are parsed as JSON and compared structurally
 *   (whitespace and object key order are ignored); resolves to `false` when either side is not valid JSON.
 * - `none`: always resolves to `true`, i.e. no comparison is made at all.
 */
export const EQUALITY_CHECKS: Record<string, EqualityFn> = {
  [EqualityCheck.DEFAULT]: async (actual: string, expected: string): Promise<boolean> => {
    return (actual || '').trim().toLowerCase() === (expected || '').trim().toLowerCase();
  },
  [EqualityCheck.JSON_EQUAL]: async (actual: string, expected: string): Promise<boolean> => {
    try {
      // JSON.stringify alone preserves key insertion order, so canonicalize both sides
      // first; otherwise {"a":1,"b":2} and {"b":2,"a":1} would compare unequal.
      const actualJson = canonicalizeJson(JSON.parse(actual.trim()));
      const expectedJson = canonicalizeJson(JSON.parse(expected.trim()));
      return JSON.stringify(actualJson) === JSON.stringify(expectedJson);
    } catch {
      // Unparseable input (or a non-string argument) counts as "not equal", not as an error.
      return false;
    }
  },
  [EqualityCheck.NONE]: async (): Promise<boolean> => {
    return true;
  },
};
export const getFastModels = (): string[] => {
return [
E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_SONNET_4,
@ -23,6 +53,12 @@ export const getFastModels = (): string[] => {
]
}
/**
 * Lists the model identifiers returned for the test-equality category.
 *
 * @returns A fresh array holding the single configured model.
 */
export const getTestEqualModels = (): string[] => {
  const testEqualModels: string[] = [E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE]
  return testEqualModels
}
export const getCodingModels = (): string[] => {
return [
E_OPENROUTER_MODEL.MODEL_NVIDIA_LLAMA_3_3_NEMOTRON_SUPER_49B_V1_FREE
@ -31,8 +67,7 @@ export const getCodingModels = (): string[] => {
export const getFileModels = (): string[] => {
return [
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
E_OPENROUTER_MODEL.MODEL_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
]
}
@ -68,6 +103,8 @@ export const getDefaultModels = (category: ModelCategory = ModelCategory.FAST):
return getCodingModels()
case ModelCategory.FILES:
return getFileModels()
case ModelCategory.TEST_EQUAL:
return getTestEqualModels()
case ModelCategory.ALL:
default:
return [
@ -184,6 +221,11 @@ export const runTest = async (
let defaultOptions = {
filters: 'code'
}
let format: any = null
if (options.format) {
const zodSchema = JSONSchemaToZod.convert(options.format);
format = zodResponseFormat(zodSchema, "format");
}
try {
const result = await Promise.race([
run({
@ -194,7 +236,7 @@ export const runTest = async (
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
logLevel: 2,
...{ ...defaultOptions, ...options },
...{ ...defaultOptions, ...options, format },
onRun: async (options) => {
model = options.model || 'unknown'
router = options.model as string
@ -220,8 +262,10 @@ export const runTest = async (
reason: 'Model returned empty response'
}
} else {
const actual = result?.[0]?.trim()?.toLowerCase() || ''
const passed = actual === expected
const actual = result?.[0] || ''
const equalityCheck = options.equalityCheck || EqualityCheck.DEFAULT
const checkFn = EQUALITY_CHECKS[equalityCheck] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]
const passed = await checkFn(actual, expected)
testResult = {
test: testName,
@ -233,7 +277,7 @@ export const runTest = async (
timestamp: new Date().toISOString(),
passed,
duration: Date.now() - startTime,
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
reason: passed ? undefined : `Expected ${expected}, but got ${actual.trim()}`,
}
}
} catch (e) {

View File

@ -13,7 +13,8 @@ import {
runTest,
generateTestReport,
getReportPaths,
ModelCategory
ModelCategory,
EqualityCheck
} from './commons'
import { IKBotOptions } from '@polymech/commons'
@ -25,6 +26,42 @@ describe('File Operations', () => {
const TEST_LOG_PATH = getReportPaths('files', 'json')
const TEST_REPORT_PATH = getReportPaths('files', 'md')
// Structured-output check, run once per entry in `models`: two image files are
// included with the prompt and the reply is constrained by a JSON schema.
it.each(models)('should identify animals in image files with model %s', async (modelName) => {
const result = await runTest(
'What animals are shown in these images?',
// Expected value passed to runTest; with equalityCheck NONE below, the real
// assertions are the expect() calls on the parsed reply further down.
'["cat","fox"]',
'file-inclusion',
modelName,
TEST_LOG_PATH,
'completion',
{
include: ['tests/test-data/files/lazyfox.jpg', 'tests/test-data/files/cat.jpg'],
logLevel: 2,
equalityCheck: EqualityCheck.NONE,
// JSON schema for the reply: an object with a required `animals` array of exactly two strings.
format: {
type: "object",
properties: {
animals: {
type: "array",
items: {
type: "string"
},
minItems: 2,
maxItems: 2
}
},
required: ["animals"]
}
} as IKBotOptions
)
// Collected for the report generated later in this file.
testResults.push(result)
// The first result entry is parsed as JSON; an empty or non-JSON reply makes JSON.parse throw and fails the test.
const parsedResult = JSON.parse(result.result[0]?.trim())
// Lower-case the names so the assertions are case-insensitive.
const animals = parsedResult.animals.map((s: string) => s.toLowerCase())
expect(animals).toContain('cat')
expect(animals).toContain('fox')
expect(animals.length).toBe(2)
}, { timeout: TEST_TIMEOUT })
it.each(models)('should process single file with model %s', async (modelName) => {
const result = await runTest(
'What is the name of the algorithm implemented in these files? Return only the name.',
@ -53,6 +90,7 @@ describe('File Operations', () => {
'completion',
{
include: ['./tests/test-data/files/*.js'],
equalityCheck: EqualityCheck.NONE,
logLevel: 2,
format: {
type: "object",
@ -70,11 +108,12 @@ describe('File Operations', () => {
}
} as IKBotOptions
)
testResults.push(result)
testResults.push(result)
const parsedResult = JSON.parse(result.result[0]?.trim())
expect(parsedResult.includes('bubble'))
expect(parsedResult.includes('factorial'))
expect(parsedResult).toHaveLength(2)
const algorithms = parsedResult.algorithms.map((s: string) => s.toLowerCase())
expect(algorithms.some(a => a.includes('bubble'))).toBe(true)
expect(algorithms.some(a => a.includes('factorial'))).toBe(true)
expect(algorithms).toHaveLength(2)
}, { timeout: TEST_TIMEOUT })
it.each(models)('should process files in glob subdirectory with model %s', async (modelName) => {
@ -84,46 +123,13 @@ describe('File Operations', () => {
'file-inclusion',
modelName,
TEST_LOG_PATH,
'completion',
'completion',
{ include: ['tests/test-data/files/glob/data.json'] }
)
testResults.push(result)
expect(result.result[0]?.trim()).toBe('Injection Barrel')
}, { timeout: TEST_TIMEOUT })
it.each(models)('should identify animals in image files with model %s', async (modelName) => {
const result = await runTest(
'What animals are shown in these images? Return as JSON array.',
'["cat","fox"]',
'file-inclusion',
modelName,
TEST_LOG_PATH,
'completion',
{
include: ['tests/test-data/files/lazyfox.jpg'],
logLevel: 2,
format: {
type: "object",
properties: {
animals: {
type: "array",
items: {
type: "string"
},
minItems: 2,
maxItems: 2
}
},
required: ["animals"]
}
} as IKBotOptions
)
testResults.push(result)
const parsedResult = JSON.parse(result.result[0]?.trim())
expect(parsedResult.includes('cat'))
expect(parsedResult.includes('fox'))
expect(parsedResult.length==2)
}, { timeout: TEST_TIMEOUT })
it('should generate markdown report', () => {
generateTestReport(testResults, 'File Operations Test Results', TEST_REPORT_PATH)

File diff suppressed because it is too large Load Diff

View File

@ -180,6 +180,173 @@
"passed": true,
"duration": 626,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:15.716Z",
"passed": true,
"duration": 2024,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:16.361Z",
"passed": true,
"duration": 641,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:20.162Z",
"passed": true,
"duration": 3798,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:21.917Z",
"passed": true,
"duration": 1752,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:22.504Z",
"passed": true,
"duration": 585,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:25.779Z",
"passed": true,
"duration": 3272,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:27.557Z",
"passed": true,
"duration": 1775,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:28.041Z",
"passed": true,
"duration": 481,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:31.450Z",
"passed": true,
"duration": 3406,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"yes"
],
"expected": "yes",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:37.473Z",
"passed": true,
"duration": 6020,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"yes"
],
"expected": "yes",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:42.394Z",
"passed": true,
"duration": 4917,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [],
"expected": "yes",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:47.544Z",
"passed": false,
"duration": 5147,
"reason": "Model returned empty response",
"category": "basic"
}
],
"highscores": [
@ -188,8 +355,8 @@
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 738,
"duration_secs": 0.738
"duration": 641,
"duration_secs": 0.641
},
{
"model": "openai/gpt-3.5-turbo",
@ -201,30 +368,30 @@
{
"test": "multiplication",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 585,
"duration_secs": 0.585
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 624,
"duration_secs": 0.624
},
{
"model": "openai/gpt-4o-mini",
"duration": 626,
"duration_secs": 0.626
}
]
},
{
"test": "division",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 481,
"duration_secs": 0.481
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 513,
"duration_secs": 0.513
},
{
"model": "openai/gpt-4o-mini",
"duration": 895,
"duration_secs": 0.895
}
]
},
@ -238,11 +405,11 @@
},
{
"model": "openai/gpt-4o-mini",
"duration": 4358,
"duration_secs": 4.358
"duration": 4917,
"duration_secs": 4.917
}
]
}
],
"lastUpdated": "2025-06-05T18:56:45.466Z"
"lastUpdated": "2025-06-05T21:19:47.545Z"
}

View File

@ -6,89 +6,125 @@
| Test | Model | Duration (ms) | Duration (s) |
|------|-------|--------------|--------------|
| addition | openai/gpt-4o-mini | 514 | 0.51 |
| addition | openai/gpt-3.5-turbo | 771 | 0.77 |
| multiplication | openai/gpt-3.5-turbo | 624 | 0.62 |
| multiplication | openai/gpt-4o-mini | 721 | 0.72 |
| division | openai/gpt-3.5-turbo | 513 | 0.51 |
| division | openai/gpt-4o-mini | 895 | 0.90 |
| web_content | openai/gpt-3.5-turbo | 220 | 0.22 |
| web_content | openai/gpt-4o-mini | 4358 | 4.36 |
| addition | openai/gpt-4o-mini | 641 | 0.64 |
| addition | anthropic/claude-sonnet-4 | 2024 | 2.02 |
| addition | deepseek/deepseek-r1:free | 3798 | 3.80 |
| multiplication | openai/gpt-4o-mini | 585 | 0.58 |
| multiplication | anthropic/claude-sonnet-4 | 1752 | 1.75 |
| multiplication | deepseek/deepseek-r1:free | 3272 | 3.27 |
| division | openai/gpt-4o-mini | 481 | 0.48 |
| division | anthropic/claude-sonnet-4 | 1775 | 1.77 |
| division | deepseek/deepseek-r1:free | 3406 | 3.41 |
| web_content | openai/gpt-4o-mini | 4917 | 4.92 |
| web_content | deepseek/deepseek-r1:free | 5147 | 5.15 |
| web_content | anthropic/claude-sonnet-4 | 6020 | 6.02 |
## Summary
- Total Tests: 8
- Passed: 7
- Total Tests: 12
- Passed: 11
- Failed: 1
- Success Rate: 87.50%
- Average Duration: 1077ms (1.08s)
- Success Rate: 91.67%
- Average Duration: 2818ms (2.82s)
## Failed Tests
### web_content - openai/gpt-3.5-turbo
### web_content - deepseek/deepseek-r1:free
- Prompt: `Check if the content contains a section about Human prehistory. Reply with "yes" if it does, "no" if it does not.`
- Expected: `yes`
- Actual: ``
- Duration: 220ms (0.22s)
- Duration: 5147ms (5.15s)
- Reason: Model returned empty response
- Timestamp: 6/5/2025, 8:46:11 PM
- Timestamp: 6/5/2025, 11:19:47 PM
## Passed Tests
### addition - openai/gpt-3.5-turbo
### addition - anthropic/claude-sonnet-4
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 771ms (0.77s)
- Timestamp: 6/5/2025, 8:46:08 PM
- Duration: 2024ms (2.02s)
- Timestamp: 6/5/2025, 11:19:15 PM
### addition - openai/gpt-4o-mini
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 514ms (0.51s)
- Timestamp: 6/5/2025, 8:46:08 PM
- Duration: 641ms (0.64s)
- Timestamp: 6/5/2025, 11:19:16 PM
### multiplication - openai/gpt-3.5-turbo
### addition - deepseek/deepseek-r1:free
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 3798ms (3.80s)
- Timestamp: 6/5/2025, 11:19:20 PM
### multiplication - anthropic/claude-sonnet-4
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 624ms (0.62s)
- Timestamp: 6/5/2025, 8:46:09 PM
- Duration: 1752ms (1.75s)
- Timestamp: 6/5/2025, 11:19:21 PM
### multiplication - openai/gpt-4o-mini
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 721ms (0.72s)
- Timestamp: 6/5/2025, 8:46:09 PM
- Duration: 585ms (0.58s)
- Timestamp: 6/5/2025, 11:19:22 PM
### division - openai/gpt-3.5-turbo
### multiplication - deepseek/deepseek-r1:free
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 3272ms (3.27s)
- Timestamp: 6/5/2025, 11:19:25 PM
### division - anthropic/claude-sonnet-4
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 513ms (0.51s)
- Timestamp: 6/5/2025, 8:46:10 PM
- Duration: 1775ms (1.77s)
- Timestamp: 6/5/2025, 11:19:27 PM
### division - openai/gpt-4o-mini
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 895ms (0.90s)
- Timestamp: 6/5/2025, 8:46:11 PM
- Duration: 481ms (0.48s)
- Timestamp: 6/5/2025, 11:19:28 PM
### division - deepseek/deepseek-r1:free
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 3406ms (3.41s)
- Timestamp: 6/5/2025, 11:19:31 PM
### web_content - anthropic/claude-sonnet-4
- Prompt: `Check if the content contains a section about Human prehistory. Reply with "yes" if it does, "no" if it does not.`
- Expected: `yes`
- Actual: `yes`
- Duration: 6020ms (6.02s)
- Timestamp: 6/5/2025, 11:19:37 PM
### web_content - openai/gpt-4o-mini
- Prompt: `Check if the content contains a section about Human prehistory. Reply with "yes" if it does, "no" if it does not.`
- Expected: `yes`
- Actual: `Yes`
- Duration: 4358ms (4.36s)
- Timestamp: 6/5/2025, 8:46:15 PM
- Actual: `yes`
- Duration: 4917ms (4.92s)
- Timestamp: 6/5/2025, 11:19:42 PM

File diff suppressed because it is too large Load Diff

View File

@ -6,44 +6,27 @@
| Test | Model | Duration (ms) | Duration (s) |
|------|-------|--------------|--------------|
| file-inclusion | openai/gpt-4o-mini | 2223 | 2.22 |
| file-inclusion | google/gemini-2.0-flash-exp:free | 2404 | 2.40 |
| file-inclusion | openai/gpt-4o | 614 | 0.61 |
## Summary
- Total Tests: 8
- Passed: 2
- Failed: 6
- Success Rate: 25.00%
- Average Duration: 1671ms (1.67s)
- Total Tests: 4
- Passed: 3
- Failed: 1
- Success Rate: 75.00%
- Average Duration: 1380ms (1.38s)
## Failed Tests
### file-inclusion - openai/gpt-4o-mini
- Prompt: `What animals are shown in these images? Return as JSON array.`
- Expected: `["cat","fox"]`
- Actual: `["cat", "fox"]`
- Duration: 2223ms (2.22s)
- Reason: Expected ["cat","fox"], but got ["cat", "fox"]
- Timestamp: 6/5/2025, 8:46:17 PM
### file-inclusion - google/gemini-2.0-flash-exp:free
- Prompt: `What animals are shown in these images? Return as JSON array.`
- Expected: `["cat","fox"]`
- Actual: `[
"cat",
"fox"
]`
- Duration: 2404ms (2.40s)
- Reason: Expected ["cat","fox"], but got [
"cat",
"fox"
]
- Timestamp: 6/5/2025, 8:46:20 PM
*No failed tests*
## Passed Tests
*No passed tests*
### file-inclusion - openai/gpt-4o
- Prompt: `What is the title of the product in data.json? Return only the title.`
- Expected: `Injection Barrel`
- Actual: `Injection Barrel`
- Duration: 614ms (0.61s)
- Timestamp: 6/6/2025, 12:29:46 AM

View File

@ -1451,21 +1451,237 @@
"passed": true,
"duration": 2616,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:30:21.412Z",
"passed": true,
"duration": 1560,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:30:22.869Z",
"passed": true,
"duration": 1451,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:30:28.307Z",
"passed": true,
"duration": 5434,
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:30:29.513Z",
"passed": false,
"duration": 1201,
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:30:30.212Z",
"passed": false,
"duration": 695,
"reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:30:33.611Z",
"passed": false,
"duration": 3395,
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A brown fox leaps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:30:34.920Z",
"passed": false,
"duration": 1304,
"reason": "Expected A fox jumps over a dog, but got A brown fox leaps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:30:35.620Z",
"passed": false,
"duration": 692,
"reason": "Expected A fox jumps over a dog, but got A fox jumps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"\"A quick brown fox leaps over a dog.\""
],
"expected": "A fox jumps over a dog",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:30:49.662Z",
"passed": false,
"duration": 14038,
"reason": "Expected A fox jumps over a dog, but got \"A quick brown fox leaps over a dog.\"",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:30:50.805Z",
"passed": true,
"duration": 1137,
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:30:51.269Z",
"passed": true,
"duration": 459,
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:30:55.198Z",
"passed": true,
"duration": 3924,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:30:56.455Z",
"passed": true,
"duration": 1251,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:30:57.083Z",
"passed": true,
"duration": 622,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"joyful"
],
"expected": "joyful",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:31:00.924Z",
"passed": true,
"duration": 3836,
"category": "language"
}
],
"highscores": [
{
"test": "translation",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 666,
"duration_secs": 0.666
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 818,
"duration_secs": 0.818
},
{
"model": "openai/gpt-4o-mini",
"duration": 1451,
"duration_secs": 1.451
}
]
},
@ -1479,8 +1695,8 @@
},
{
"model": "openai/gpt-4o-mini",
"duration": 1171,
"duration_secs": 1.171
"duration": 695,
"duration_secs": 0.695
}
]
},
@ -1494,41 +1710,41 @@
},
{
"model": "openai/gpt-4o-mini",
"duration": 699,
"duration_secs": 0.699
"duration": 692,
"duration_secs": 0.692
}
]
},
{
"test": "language_detection",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 459,
"duration_secs": 0.459
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 695,
"duration_secs": 0.695
},
{
"model": "openai/gpt-4o-mini",
"duration": 776,
"duration_secs": 0.776
}
]
},
{
"test": "synonyms",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 548,
"duration_secs": 0.548
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 570,
"duration_secs": 0.57
},
{
"model": "openai/gpt-4o-mini",
"duration": 622,
"duration_secs": 0.622
}
]
}
],
"lastUpdated": "2025-06-05T18:56:08.627Z"
"lastUpdated": "2025-06-05T22:31:00.924Z"
}

View File

@ -6,164 +6,157 @@
| Test | Model | Duration (ms) | Duration (s) |
|------|-------|--------------|--------------|
| translation | openai/gpt-4o-mini | 666 | 0.67 |
| translation | anthropic/claude-sonnet-4 | 1317 | 1.32 |
| translation | deepseek/deepseek-r1:free | 5397 | 5.40 |
| grammar | openai/gpt-4o-mini | 1171 | 1.17 |
| grammar | anthropic/claude-sonnet-4 | 1722 | 1.72 |
| grammar | deepseek/deepseek-r1:free | 5199 | 5.20 |
| summarization | openai/gpt-4o-mini | 699 | 0.70 |
| summarization | anthropic/claude-sonnet-4 | 1820 | 1.82 |
| summarization | deepseek/deepseek-r1:free | 7380 | 7.38 |
| language_detection | openai/gpt-4o-mini | 776 | 0.78 |
| language_detection | anthropic/claude-sonnet-4 | 1725 | 1.73 |
| language_detection | deepseek/deepseek-r1:free | 5247 | 5.25 |
| synonyms | openai/gpt-4o-mini | 548 | 0.55 |
| synonyms | anthropic/claude-sonnet-4 | 1967 | 1.97 |
| synonyms | deepseek/deepseek-r1:free | 2616 | 2.62 |
| translation | openai/gpt-4o-mini | 1451 | 1.45 |
| translation | anthropic/claude-sonnet-4 | 1560 | 1.56 |
| translation | deepseek/deepseek-r1:free | 5434 | 5.43 |
| grammar | openai/gpt-4o-mini | 695 | 0.69 |
| grammar | anthropic/claude-sonnet-4 | 1201 | 1.20 |
| grammar | deepseek/deepseek-r1:free | 3395 | 3.40 |
| summarization | openai/gpt-4o-mini | 692 | 0.69 |
| summarization | anthropic/claude-sonnet-4 | 1304 | 1.30 |
| summarization | deepseek/deepseek-r1:free | 14038 | 14.04 |
| language_detection | openai/gpt-4o-mini | 459 | 0.46 |
| language_detection | anthropic/claude-sonnet-4 | 1137 | 1.14 |
| language_detection | deepseek/deepseek-r1:free | 3924 | 3.92 |
| synonyms | openai/gpt-4o-mini | 622 | 0.62 |
| synonyms | anthropic/claude-sonnet-4 | 1251 | 1.25 |
| synonyms | deepseek/deepseek-r1:free | 3836 | 3.84 |
## Summary
- Total Tests: 15
- Passed: 2
- Failed: 13
- Success Rate: 13.33%
- Average Duration: 2550ms (2.55s)
- Passed: 9
- Failed: 6
- Success Rate: 60.00%
- Average Duration: 2733ms (2.73s)
## Failed Tests
### translation - anthropic/claude-sonnet-4
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
- Expected: `¡Hola, mundo!`
- Actual: `¡Hola, mundo!`
- Duration: 1317ms (1.32s)
- Reason: Expected ¡Hola, mundo!, but got ¡hola, mundo!
- Timestamp: 6/5/2025, 8:55:31 PM
### translation - openai/gpt-4o-mini
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
- Expected: `¡Hola, mundo!`
- Actual: `¡Hola, mundo!`
- Duration: 666ms (0.67s)
- Reason: Expected ¡Hola, mundo!, but got ¡hola, mundo!
- Timestamp: 6/5/2025, 8:55:32 PM
### translation - deepseek/deepseek-r1:free
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
- Expected: `¡Hola, mundo!`
- Actual: `¡Hola, mundo!`
- Duration: 5397ms (5.40s)
- Reason: Expected ¡Hola, mundo!, but got ¡hola, mundo!
- Timestamp: 6/5/2025, 8:55:37 PM
### grammar - anthropic/claude-sonnet-4
- Prompt: `Correct the grammar in: "I goes to the store yesterday". Return only the corrected sentence, no explanation.`
- Expected: `I went to the store yesterday`
- Actual: `I went to the store yesterday.`
- Duration: 1722ms (1.72s)
- Reason: Expected I went to the store yesterday, but got i went to the store yesterday.
- Timestamp: 6/5/2025, 8:55:39 PM
- Duration: 1201ms (1.20s)
- Reason: Expected I went to the store yesterday, but got I went to the store yesterday.
- Timestamp: 6/6/2025, 12:30:29 AM
### grammar - openai/gpt-4o-mini
- Prompt: `Correct the grammar in: "I goes to the store yesterday". Return only the corrected sentence, no explanation.`
- Expected: `I went to the store yesterday`
- Actual: `"I went to the store yesterday."`
- Duration: 1171ms (1.17s)
- Reason: Expected I went to the store yesterday, but got "i went to the store yesterday."
- Timestamp: 6/5/2025, 8:55:40 PM
- Duration: 695ms (0.69s)
- Reason: Expected I went to the store yesterday, but got "I went to the store yesterday."
- Timestamp: 6/6/2025, 12:30:30 AM
### grammar - deepseek/deepseek-r1:free
- Prompt: `Correct the grammar in: "I goes to the store yesterday". Return only the corrected sentence, no explanation.`
- Expected: `I went to the store yesterday`
- Actual: `"I went to the store yesterday."`
- Duration: 5199ms (5.20s)
- Reason: Expected I went to the store yesterday, but got "i went to the store yesterday."
- Timestamp: 6/5/2025, 8:55:45 PM
- Actual: `I went to the store yesterday.`
- Duration: 3395ms (3.40s)
- Reason: Expected I went to the store yesterday, but got I went to the store yesterday.
- Timestamp: 6/6/2025, 12:30:33 AM
### summarization - anthropic/claude-sonnet-4
- Prompt: `Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.`
- Expected: `A fox jumps over a dog`
- Actual: `A fox jumps over a dog.`
- Duration: 1820ms (1.82s)
- Reason: Expected A fox jumps over a dog, but got a fox jumps over a dog.
- Timestamp: 6/5/2025, 8:55:47 PM
- Actual: `A brown fox leaps over a dog.`
- Duration: 1304ms (1.30s)
- Reason: Expected A fox jumps over a dog, but got A brown fox leaps over a dog.
- Timestamp: 6/6/2025, 12:30:34 AM
### summarization - openai/gpt-4o-mini
- Prompt: `Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.`
- Expected: `A fox jumps over a dog`
- Actual: `A fox jumps over a dog.`
- Duration: 699ms (0.70s)
- Reason: Expected A fox jumps over a dog, but got a fox jumps over a dog.
- Timestamp: 6/5/2025, 8:55:48 PM
- Duration: 692ms (0.69s)
- Reason: Expected A fox jumps over a dog, but got A fox jumps over a dog.
- Timestamp: 6/6/2025, 12:30:35 AM
### summarization - deepseek/deepseek-r1:free
- Prompt: `Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.`
- Expected: `A fox jumps over a dog`
- Actual: `A quick brown fox leaps over a dog.`
- Duration: 7380ms (7.38s)
- Reason: Expected A fox jumps over a dog, but got a quick brown fox leaps over a dog.
- Timestamp: 6/5/2025, 8:55:55 PM
- Actual: `"A quick brown fox leaps over a dog."`
- Duration: 14038ms (14.04s)
- Reason: Expected A fox jumps over a dog, but got "A quick brown fox leaps over a dog."
- Timestamp: 6/6/2025, 12:30:49 AM
## Passed Tests
### translation - anthropic/claude-sonnet-4
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
- Expected: `¡Hola, mundo!`
- Actual: `¡Hola, mundo!`
- Duration: 1560ms (1.56s)
- Timestamp: 6/6/2025, 12:30:21 AM
### translation - openai/gpt-4o-mini
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
- Expected: `¡Hola, mundo!`
- Actual: `¡Hola, mundo!`
- Duration: 1451ms (1.45s)
- Timestamp: 6/6/2025, 12:30:22 AM
### translation - deepseek/deepseek-r1:free
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
- Expected: `¡Hola, mundo!`
- Actual: `¡Hola, mundo!`
- Duration: 5434ms (5.43s)
- Timestamp: 6/6/2025, 12:30:28 AM
### language_detection - anthropic/claude-sonnet-4
- Prompt: `Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.`
- Expected: `French`
- Actual: `French`
- Duration: 1725ms (1.73s)
- Reason: Expected French, but got french
- Timestamp: 6/5/2025, 8:55:57 PM
- Duration: 1137ms (1.14s)
- Timestamp: 6/6/2025, 12:30:50 AM
### language_detection - openai/gpt-4o-mini
- Prompt: `Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.`
- Expected: `French`
- Actual: `French`
- Duration: 776ms (0.78s)
- Reason: Expected French, but got french
- Timestamp: 6/5/2025, 8:55:58 PM
- Duration: 459ms (0.46s)
- Timestamp: 6/6/2025, 12:30:51 AM
### language_detection - deepseek/deepseek-r1:free
- Prompt: `Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.`
- Expected: `French`
- Actual: `French`
- Duration: 5247ms (5.25s)
- Reason: Expected French, but got french
- Timestamp: 6/5/2025, 8:56:03 PM
- Duration: 3924ms (3.92s)
- Timestamp: 6/6/2025, 12:30:55 AM
### synonyms - anthropic/claude-sonnet-4
- Prompt: `Provide a synonym for "happy". Return only the synonym, no explanation.`
- Expected: `joyful`
- Actual: `Content`
- Duration: 1967ms (1.97s)
- Reason: Expected joyful, but got content
- Timestamp: 6/5/2025, 8:56:05 PM
## Passed Tests
- Actual: `Joyful`
- Duration: 1251ms (1.25s)
- Timestamp: 6/6/2025, 12:30:56 AM
### synonyms - openai/gpt-4o-mini
- Prompt: `Provide a synonym for "happy". Return only the synonym, no explanation.`
- Expected: `joyful`
- Actual: `Joyful`
- Duration: 548ms (0.55s)
- Timestamp: 6/5/2025, 8:56:06 PM
- Duration: 622ms (0.62s)
- Timestamp: 6/6/2025, 12:30:57 AM
### synonyms - deepseek/deepseek-r1:free
- Prompt: `Provide a synonym for "happy". Return only the synonym, no explanation.`
- Expected: `joyful`
- Actual: `Joyful`
- Duration: 2616ms (2.62s)
- Timestamp: 6/5/2025, 8:56:08 PM
- Actual: `joyful`
- Duration: 3836ms (3.84s)
- Timestamp: 6/6/2025, 12:31:00 AM