fix: response format for tests

parent d9dc88b972
commit 1535cb754a

packages/kbot/.vscode/launch.json (vendored), 20 lines changed
@@ -4,6 +4,23 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
+        {
+            "type": "node",
+            "request": "launch",
+            "name": "Vitest: Debug Open File",
+            "program": "${workspaceFolder}/node_modules/vitest/vitest.mjs",
+            "args": [
+                "run",
+                "${relativeFile}"
+            ],
+            "skipFiles": [
+                "<node_internals>/**"
+            ],
+            "console": "integratedTerminal",
+            "sourceMaps": true,
+            "smartStep": true,
+            "internalConsoleOptions": "neverOpen"
+        },
         {
             "type": "node",
             "request": "launch",
@@ -768,5 +785,6 @@
             "console": "integratedTerminal",
             "outputCapture": "std"
         }
-    ]
+    ],
+    "compounds": []
 }
File diff suppressed because one or more lines are too long

@@ -3,7 +3,7 @@
     "messages": [
         {
             "role": "user",
-            "content": "multiply 8 and 3. Return only the number, no explanation."
+            "content": "Provide a synonym for \"happy\". Return only the synonym, no explanation."
         },
         {
             "role": "user",
packages/kbot/package-lock.json (generated), 9 lines changed

@@ -9,6 +9,7 @@
         "version": "0.3.5",
         "license": "MIT",
         "dependencies": {
+            "@dmitryrechkin/json-schema-to-zod": "1.0.1",
             "@polymech/ai-tools": "file:../ai-tools",
             "@polymech/cache": "file:../cache",
             "@polymech/commons": "file:../commons",
@@ -400,6 +401,14 @@
                 "node": ">=14.17.0"
             }
         },
+        "node_modules/@dmitryrechkin/json-schema-to-zod": {
+            "version": "1.0.1",
+            "resolved": "https://registry.npmjs.org/@dmitryrechkin/json-schema-to-zod/-/json-schema-to-zod-1.0.1.tgz",
+            "integrity": "sha512-cG9gC4NMu/7JZqmRZy6uIb+l+kxek2GFQ0/qrhw7xeFK2l5B9yF9FVuujoqFPLRGDHNFYqtBWht7hY4KB0ngrA==",
+            "dependencies": {
+                "zod": "^3.23.8"
+            }
+        },
         "node_modules/@esbuild/aix-ppc64": {
             "version": "0.21.5",
             "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",
@@ -53,6 +53,7 @@
         "examples:iterator-markdown:no-cache": "node dist-in/examples/core/iterator-markdown-example.js --no-cache"
     },
     "dependencies": {
+        "@dmitryrechkin/json-schema-to-zod": "1.0.1",
        "@polymech/ai-tools": "file:../ai-tools",
        "@polymech/cache": "file:../cache",
        "@polymech/commons": "file:../commons",
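Note: the new `@dmitryrechkin/json-schema-to-zod` dependency is what the reworked `runTest` further down uses to turn a plain JSON Schema into an OpenAI structured-output format. A minimal sketch of that flow, reusing the schema that the file-operations tests in this commit pass as `format` (variable names here are illustrative only, and types are kept loose for brevity):

    // Sketch only: JSON Schema -> Zod schema -> OpenAI response_format payload.
    import { JSONSchemaToZod } from '@dmitryrechkin/json-schema-to-zod'
    import { zodResponseFormat } from 'openai/helpers/zod'

    // Example schema taken from the file-operations tests in this commit.
    const jsonSchema = {
        type: 'object',
        properties: {
            animals: { type: 'array', items: { type: 'string' }, minItems: 2, maxItems: 2 }
        },
        required: ['animals']
    }

    const zodSchema = JSONSchemaToZod.convert(jsonSchema)   // JSON Schema -> Zod
    const format = zodResponseFormat(zodSchema, 'format')   // passed on as the completion's response_format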
@@ -88,6 +88,7 @@ export const complete_options = async (opts: IKBotTask): Promise<IKBotTask | nul
     options.client = client
     options.variables = { ...options.variables, ...variables(options) }
     options.collector = collector(options, client)
+    options.format = opts.format
     options.onRun = options.onRun || (async (options) => options)

     return options
@@ -174,7 +175,6 @@ export const complete_params = async (
     if (options.mode === E_Mode.TOOLS || options.mode === E_Mode.ASSISTANT) {
         params.tools = await loadTools(options)
         params.tool_choice = 'auto'
-        //params.parallel_tool_calls = false
     }

     return params
@@ -5,16 +5,46 @@ import { sync as write } from "@polymech/fs/write"
 import { sync as read } from "@polymech/fs/read"
 import { sync as exists } from "@polymech/fs/exists"
 import { sync as mkdirp } from "mkdirp"

+import { zodResponseFormat } from "openai/helpers/zod"
+import { JSONSchemaToZod } from '@dmitryrechkin/json-schema-to-zod';
 export enum ModelCategory {
     FAST = 'fast',
     LANGUAGE = 'language',
     TOOL = 'tool',
     ALL = 'all',
     CODING = 'coding',
-    FILES = 'file'
+    FILES = 'file',
+    TEST_EQUAL = ''
 }

+export enum EqualityCheck {
+    DEFAULT = 'default',
+    JSON_EQUAL = 'json_equal',
+    LLM_EQUAL = 'llm_equal',
+    NONE = 'none'
+}
+
+export type EqualityFn = (actual: string, expected: string) => Promise<boolean>;
+
+export const EQUALITY_CHECKS: Record<string, EqualityFn> = {
+    [EqualityCheck.DEFAULT]: async (actual: string, expected: string): Promise<boolean> => {
+        return (actual || '').trim().toLowerCase() === (expected || '').trim().toLowerCase();
+    },
+    [EqualityCheck.JSON_EQUAL]: async (actual: string, expected: string): Promise<boolean> => {
+        try {
+            // we just stringify to normalize and compare
+            const actualJson = JSON.parse(actual.trim());
+            const expectedJson = JSON.parse(expected.trim());
+            return JSON.stringify(actualJson) === JSON.stringify(expectedJson);
+        } catch (e) {
+            return false;
+        }
+    },
+    [EqualityCheck.NONE]: async (): Promise<boolean> => {
+        return true;
+    },
+};
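As a quick, hedged illustration of how these checks are consumed (it mirrors the lookup that `runTest` performs further down; the sample strings come from the test reports changed in this commit):

    // Uses EqualityCheck / EQUALITY_CHECKS from the definitions above.
    // Resolve a check by name, falling back to the default comparison.
    const check = EQUALITY_CHECKS[EqualityCheck.JSON_EQUAL] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]

    // JSON_EQUAL parses both sides and compares the re-serialized values,
    // so formatting-only differences no longer fail a test:
    await check('[ "cat", "fox" ]', '["cat","fox"]')             // -> true

    // DEFAULT trims and lower-cases both sides before comparing:
    await EQUALITY_CHECKS[EqualityCheck.DEFAULT]('Yes', 'yes')   // -> true

Note that LLM_EQUAL is declared in the enum but has no entry in EQUALITY_CHECKS yet, which is why the lookup falls back to the default check for it.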

 export const getFastModels = (): string[] => {
     return [
         E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_SONNET_4,
@@ -23,6 +53,12 @@ export const getFastModels = (): string[] => {
     ]
 }

+export const getTestEqualModels = (): string[] => {
+    return [
+        E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE
+    ]
+}
+
 export const getCodingModels = (): string[] => {
     return [
         E_OPENROUTER_MODEL.MODEL_NVIDIA_LLAMA_3_3_NEMOTRON_SUPER_49B_V1_FREE
@@ -31,8 +67,7 @@ export const getCodingModels = (): string[] => {

 export const getFileModels = (): string[] => {
     return [
         E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
-        E_OPENROUTER_MODEL.MODEL_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE
+        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
     ]
 }

@@ -68,6 +103,8 @@ export const getDefaultModels = (category: ModelCategory = ModelCategory.FAST):
             return getCodingModels()
         case ModelCategory.FILES:
             return getFileModels()
+        case ModelCategory.TEST_EQUAL:
+            return getTestEqualModels()
         case ModelCategory.ALL:
         default:
             return [
@@ -184,6 +221,11 @@ export const runTest = async (
     let defaultOptions = {
         filters: 'code'
     }
+    let format: any = null
+    if (options.format) {
+        const zodSchema = JSONSchemaToZod.convert(options.format);
+        format = zodResponseFormat(zodSchema, "format");
+    }
     try {
         const result = await Promise.race([
             run({
@@ -194,7 +236,7 @@ export const runTest = async (
                 logs: TEST_LOGS_PATH,
                 preferences: TEST_PREFERENCES_PATH,
                 logLevel: 2,
-                ...{ ...defaultOptions, ...options },
+                ...{ ...defaultOptions, ...options, format },
                 onRun: async (options) => {
                     model = options.model || 'unknown'
                     router = options.model as string
@@ -220,8 +262,10 @@ export const runTest = async (
                 reason: 'Model returned empty response'
             }
         } else {
-            const actual = result?.[0]?.trim()?.toLowerCase() || ''
-            const passed = actual === expected
+            const actual = result?.[0] || ''
+            const equalityCheck = options.equalityCheck || EqualityCheck.DEFAULT
+            const checkFn = EQUALITY_CHECKS[equalityCheck] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]
+            const passed = await checkFn(actual, expected)

             testResult = {
                 test: testName,
@@ -233,7 +277,7 @@ export const runTest = async (
                 timestamp: new Date().toISOString(),
                 passed,
                 duration: Date.now() - startTime,
-                reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
+                reason: passed ? undefined : `Expected ${expected}, but got ${actual.trim()}`,
             }
         }
     } catch (e) {
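Two details in the runTest changes above are easy to miss, so here is a small self-contained sketch of the merge semantics (the names mirror the diff; the values are placeholders, not the real objects):

    // Placeholder values; `format` stands in for the converted response_format object.
    const defaultOptions = { filters: 'code' }
    const options = { filters: 'markdown', format: { type: 'object' } }   // caller-supplied JSON Schema
    const format = { type: 'json_schema' }                                // result of zodResponseFormat(...)

    // `format` is spread last, so the converted value overrides the raw
    // JSON Schema that arrived in options.format; everything else in
    // `options` still wins over defaultOptions.
    const merged = { ...defaultOptions, ...options, format }
    // merged.filters === 'markdown', merged.format === format

The equality check is resolved in the same defensive way: an unknown or missing options.equalityCheck falls back to the default trim/lower-case comparison, so existing tests keep their old behaviour unless they opt in to JSON_EQUAL or NONE.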
@@ -13,7 +13,8 @@ import {
     runTest,
     generateTestReport,
     getReportPaths,
-    ModelCategory
+    ModelCategory,
+    EqualityCheck
 } from './commons'
 import { IKBotOptions } from '@polymech/commons'

@@ -25,6 +26,42 @@ describe('File Operations', () => {
     const TEST_LOG_PATH = getReportPaths('files', 'json')
     const TEST_REPORT_PATH = getReportPaths('files', 'md')

+    it.each(models)('should identify animals in image files with model %s', async (modelName) => {
+        const result = await runTest(
+            'What animals are shown in these images?',
+            '["cat","fox"]',
+            'file-inclusion',
+            modelName,
+            TEST_LOG_PATH,
+            'completion',
+            {
+                include: ['tests/test-data/files/lazyfox.jpg', 'tests/test-data/files/cat.jpg'],
+                logLevel: 2,
+                equalityCheck: EqualityCheck.NONE,
+                format: {
+                    type: "object",
+                    properties: {
+                        animals: {
+                            type: "array",
+                            items: {
+                                type: "string"
+                            },
+                            minItems: 2,
+                            maxItems: 2
+                        }
+                    },
+                    required: ["animals"]
+                }
+            } as IKBotOptions
+        )
+        testResults.push(result)
+        const parsedResult = JSON.parse(result.result[0]?.trim())
+        const animals = parsedResult.animals.map((s: string) => s.toLowerCase())
+        expect(animals).toContain('cat')
+        expect(animals).toContain('fox')
+        expect(animals.length).toBe(2)
+    }, { timeout: TEST_TIMEOUT })
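For readers who do not work with JSON Schema day to day, the format block above converts (via JSONSchemaToZod) into roughly the following hand-written Zod equivalent; this is an illustration, not generated output:

    import { z } from 'zod'

    // Rough equivalent of the JSON Schema passed as `format` in the test above.
    const animalsSchema = z.object({
        animals: z.array(z.string()).min(2).max(2)
    })

    // zodResponseFormat(animalsSchema, 'format') then constrains the model to
    // return an object shaped like { "animals": ["...", "..."] }, which is what
    // the JSON.parse / toContain assertions above rely on.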

     it.each(models)('should process single file with model %s', async (modelName) => {
         const result = await runTest(
             'What is the name of the algorithm implemented in these files? Return only the name.',
@@ -53,6 +90,7 @@ describe('File Operations', () => {
             'completion',
             {
                 include: ['./tests/test-data/files/*.js'],
+                equalityCheck: EqualityCheck.NONE,
                 logLevel: 2,
                 format: {
                     type: "object",
@@ -70,11 +108,12 @@ describe('File Operations', () => {
                 }
             } as IKBotOptions
         )
-        testResults.push(result)
+        testResults.push(result)
         const parsedResult = JSON.parse(result.result[0]?.trim())
-        expect(parsedResult.includes('bubble'))
-        expect(parsedResult.includes('factorial'))
-        expect(parsedResult).toHaveLength(2)
+        const algorithms = parsedResult.algorithms.map((s: string) => s.toLowerCase())
+        expect(algorithms.some(a => a.includes('bubble'))).toBe(true)
+        expect(algorithms.some(a => a.includes('factorial'))).toBe(true)
+        expect(algorithms).toHaveLength(2)
     }, { timeout: TEST_TIMEOUT })

     it.each(models)('should process files in glob subdirectory with model %s', async (modelName) => {
@@ -84,46 +123,13 @@ describe('File Operations', () => {
             'file-inclusion',
             modelName,
             TEST_LOG_PATH,
-            'completion',
+            'completion',
             { include: ['tests/test-data/files/glob/data.json'] }
         )
         testResults.push(result)
         expect(result.result[0]?.trim()).toBe('Injection Barrel')
     }, { timeout: TEST_TIMEOUT })

-    it.each(models)('should identify animals in image files with model %s', async (modelName) => {
-        const result = await runTest(
-            'What animals are shown in these images? Return as JSON array.',
-            '["cat","fox"]',
-            'file-inclusion',
-            modelName,
-            TEST_LOG_PATH,
-            'completion',
-            {
-                include: ['tests/test-data/files/lazyfox.jpg'],
-                logLevel: 2,
-                format: {
-                    type: "object",
-                    properties: {
-                        animals: {
-                            type: "array",
-                            items: {
-                                type: "string"
-                            },
-                            minItems: 2,
-                            maxItems: 2
-                        }
-                    },
-                    required: ["animals"]
-                }
-            } as IKBotOptions
-        )
-        testResults.push(result)
-        const parsedResult = JSON.parse(result.result[0]?.trim())
-        expect(parsedResult.includes('cat'))
-        expect(parsedResult.includes('fox'))
-        expect(parsedResult.length==2)
-    }, { timeout: TEST_TIMEOUT })
-
     it('should generate markdown report', () => {
         generateTestReport(testResults, 'File Operations Test Results', TEST_REPORT_PATH)
File diff suppressed because it is too large
@ -180,6 +180,173 @@
|
||||
"passed": true,
|
||||
"duration": 626,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T21:19:15.716Z",
|
||||
"passed": true,
|
||||
"duration": 2024,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T21:19:16.361Z",
|
||||
"passed": true,
|
||||
"duration": 641,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T21:19:20.162Z",
|
||||
"passed": true,
|
||||
"duration": 3798,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T21:19:21.917Z",
|
||||
"passed": true,
|
||||
"duration": 1752,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T21:19:22.504Z",
|
||||
"passed": true,
|
||||
"duration": 585,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T21:19:25.779Z",
|
||||
"passed": true,
|
||||
"duration": 3272,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T21:19:27.557Z",
|
||||
"passed": true,
|
||||
"duration": 1775,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T21:19:28.041Z",
|
||||
"passed": true,
|
||||
"duration": 481,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T21:19:31.450Z",
|
||||
"passed": true,
|
||||
"duration": 3406,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "web_content",
|
||||
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
||||
"result": [
|
||||
"yes"
|
||||
],
|
||||
"expected": "yes",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T21:19:37.473Z",
|
||||
"passed": true,
|
||||
"duration": 6020,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "web_content",
|
||||
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
||||
"result": [
|
||||
"yes"
|
||||
],
|
||||
"expected": "yes",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T21:19:42.394Z",
|
||||
"passed": true,
|
||||
"duration": 4917,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "web_content",
|
||||
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
||||
"result": [],
|
||||
"expected": "yes",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T21:19:47.544Z",
|
||||
"passed": false,
|
||||
"duration": 5147,
|
||||
"reason": "Model returned empty response",
|
||||
"category": "basic"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
@ -188,8 +355,8 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 738,
|
||||
"duration_secs": 0.738
|
||||
"duration": 641,
|
||||
"duration_secs": 0.641
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
@ -201,30 +368,30 @@
|
||||
{
|
||||
"test": "multiplication",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 585,
|
||||
"duration_secs": 0.585
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 624,
|
||||
"duration_secs": 0.624
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 626,
|
||||
"duration_secs": 0.626
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 481,
|
||||
"duration_secs": 0.481
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 513,
|
||||
"duration_secs": 0.513
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 895,
|
||||
"duration_secs": 0.895
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -238,11 +405,11 @@
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 4358,
|
||||
"duration_secs": 4.358
|
||||
"duration": 4917,
|
||||
"duration_secs": 4.917
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-06-05T18:56:45.466Z"
|
||||
"lastUpdated": "2025-06-05T21:19:47.545Z"
|
||||
}
|
||||
@ -6,89 +6,125 @@
|
||||
|
||||
| Test | Model | Duration (ms) | Duration (s) |
|
||||
|------|-------|--------------|--------------|
|
||||
| addition | openai/gpt-4o-mini | 514 | 0.51 |
|
||||
| addition | openai/gpt-3.5-turbo | 771 | 0.77 |
|
||||
| multiplication | openai/gpt-3.5-turbo | 624 | 0.62 |
|
||||
| multiplication | openai/gpt-4o-mini | 721 | 0.72 |
|
||||
| division | openai/gpt-3.5-turbo | 513 | 0.51 |
|
||||
| division | openai/gpt-4o-mini | 895 | 0.90 |
|
||||
| web_content | openai/gpt-3.5-turbo | 220 | 0.22 |
|
||||
| web_content | openai/gpt-4o-mini | 4358 | 4.36 |
|
||||
| addition | openai/gpt-4o-mini | 641 | 0.64 |
|
||||
| addition | anthropic/claude-sonnet-4 | 2024 | 2.02 |
|
||||
| addition | deepseek/deepseek-r1:free | 3798 | 3.80 |
|
||||
| multiplication | openai/gpt-4o-mini | 585 | 0.58 |
|
||||
| multiplication | anthropic/claude-sonnet-4 | 1752 | 1.75 |
|
||||
| multiplication | deepseek/deepseek-r1:free | 3272 | 3.27 |
|
||||
| division | openai/gpt-4o-mini | 481 | 0.48 |
|
||||
| division | anthropic/claude-sonnet-4 | 1775 | 1.77 |
|
||||
| division | deepseek/deepseek-r1:free | 3406 | 3.41 |
|
||||
| web_content | openai/gpt-4o-mini | 4917 | 4.92 |
|
||||
| web_content | deepseek/deepseek-r1:free | 5147 | 5.15 |
|
||||
| web_content | anthropic/claude-sonnet-4 | 6020 | 6.02 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 8
|
||||
- Passed: 7
|
||||
- Total Tests: 12
|
||||
- Passed: 11
|
||||
- Failed: 1
|
||||
- Success Rate: 87.50%
|
||||
- Average Duration: 1077ms (1.08s)
|
||||
- Success Rate: 91.67%
|
||||
- Average Duration: 2818ms (2.82s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### web_content - openai/gpt-3.5-turbo
|
||||
### web_content - deepseek/deepseek-r1:free
|
||||
|
||||
- Prompt: `Check if the content contains a section about Human prehistory. Reply with "yes" if it does, "no" if it does not.`
|
||||
- Expected: `yes`
|
||||
- Actual: ``
|
||||
- Duration: 220ms (0.22s)
|
||||
- Duration: 5147ms (5.15s)
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 6/5/2025, 8:46:11 PM
|
||||
- Timestamp: 6/5/2025, 11:19:47 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### addition - openai/gpt-3.5-turbo
|
||||
### addition - anthropic/claude-sonnet-4
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 771ms (0.77s)
|
||||
- Timestamp: 6/5/2025, 8:46:08 PM
|
||||
- Duration: 2024ms (2.02s)
|
||||
- Timestamp: 6/5/2025, 11:19:15 PM
|
||||
|
||||
### addition - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 514ms (0.51s)
|
||||
- Timestamp: 6/5/2025, 8:46:08 PM
|
||||
- Duration: 641ms (0.64s)
|
||||
- Timestamp: 6/5/2025, 11:19:16 PM
|
||||
|
||||
### multiplication - openai/gpt-3.5-turbo
|
||||
### addition - deepseek/deepseek-r1:free
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 3798ms (3.80s)
|
||||
- Timestamp: 6/5/2025, 11:19:20 PM
|
||||
|
||||
### multiplication - anthropic/claude-sonnet-4
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 624ms (0.62s)
|
||||
- Timestamp: 6/5/2025, 8:46:09 PM
|
||||
- Duration: 1752ms (1.75s)
|
||||
- Timestamp: 6/5/2025, 11:19:21 PM
|
||||
|
||||
### multiplication - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 721ms (0.72s)
|
||||
- Timestamp: 6/5/2025, 8:46:09 PM
|
||||
- Duration: 585ms (0.58s)
|
||||
- Timestamp: 6/5/2025, 11:19:22 PM
|
||||
|
||||
### division - openai/gpt-3.5-turbo
|
||||
### multiplication - deepseek/deepseek-r1:free
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 3272ms (3.27s)
|
||||
- Timestamp: 6/5/2025, 11:19:25 PM
|
||||
|
||||
### division - anthropic/claude-sonnet-4
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 513ms (0.51s)
|
||||
- Timestamp: 6/5/2025, 8:46:10 PM
|
||||
- Duration: 1775ms (1.77s)
|
||||
- Timestamp: 6/5/2025, 11:19:27 PM
|
||||
|
||||
### division - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 895ms (0.90s)
|
||||
- Timestamp: 6/5/2025, 8:46:11 PM
|
||||
- Duration: 481ms (0.48s)
|
||||
- Timestamp: 6/5/2025, 11:19:28 PM
|
||||
|
||||
### division - deepseek/deepseek-r1:free
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 3406ms (3.41s)
|
||||
- Timestamp: 6/5/2025, 11:19:31 PM
|
||||
|
||||
### web_content - anthropic/claude-sonnet-4
|
||||
|
||||
- Prompt: `Check if the content contains a section about Human prehistory. Reply with "yes" if it does, "no" if it does not.`
|
||||
- Expected: `yes`
|
||||
- Actual: `yes`
|
||||
- Duration: 6020ms (6.02s)
|
||||
- Timestamp: 6/5/2025, 11:19:37 PM
|
||||
|
||||
### web_content - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Check if the content contains a section about Human prehistory. Reply with "yes" if it does, "no" if it does not.`
|
||||
- Expected: `yes`
|
||||
- Actual: `Yes`
|
||||
- Duration: 4358ms (4.36s)
|
||||
- Timestamp: 6/5/2025, 8:46:15 PM
|
||||
- Actual: `yes`
|
||||
- Duration: 4917ms (4.92s)
|
||||
- Timestamp: 6/5/2025, 11:19:42 PM
|
||||
|
||||
|
||||
File diff suppressed because it is too large
@ -6,44 +6,27 @@
|
||||
|
||||
| Test | Model | Duration (ms) | Duration (s) |
|
||||
|------|-------|--------------|--------------|
|
||||
| file-inclusion | openai/gpt-4o-mini | 2223 | 2.22 |
|
||||
| file-inclusion | google/gemini-2.0-flash-exp:free | 2404 | 2.40 |
|
||||
| file-inclusion | openai/gpt-4o | 614 | 0.61 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 8
|
||||
- Passed: 2
|
||||
- Failed: 6
|
||||
- Success Rate: 25.00%
|
||||
- Average Duration: 1671ms (1.67s)
|
||||
- Total Tests: 4
|
||||
- Passed: 3
|
||||
- Failed: 1
|
||||
- Success Rate: 75.00%
|
||||
- Average Duration: 1380ms (1.38s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### file-inclusion - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `What animals are shown in these images? Return as JSON array.`
|
||||
- Expected: `["cat","fox"]`
|
||||
- Actual: `["cat", "fox"]`
|
||||
- Duration: 2223ms (2.22s)
|
||||
- Reason: Expected ["cat","fox"], but got ["cat", "fox"]
|
||||
- Timestamp: 6/5/2025, 8:46:17 PM
|
||||
|
||||
### file-inclusion - google/gemini-2.0-flash-exp:free
|
||||
|
||||
- Prompt: `What animals are shown in these images? Return as JSON array.`
|
||||
- Expected: `["cat","fox"]`
|
||||
- Actual: `[
|
||||
"cat",
|
||||
"fox"
|
||||
]`
|
||||
- Duration: 2404ms (2.40s)
|
||||
- Reason: Expected ["cat","fox"], but got [
|
||||
"cat",
|
||||
"fox"
|
||||
]
|
||||
- Timestamp: 6/5/2025, 8:46:20 PM
|
||||
*No failed tests*
|
||||
|
||||
## Passed Tests
|
||||
|
||||
*No passed tests*
|
||||
### file-inclusion - openai/gpt-4o
|
||||
|
||||
- Prompt: `What is the title of the product in data.json? Return only the title.`
|
||||
- Expected: `Injection Barrel`
|
||||
- Actual: `Injection Barrel`
|
||||
- Duration: 614ms (0.61s)
|
||||
- Timestamp: 6/6/2025, 12:29:46 AM
|
||||
|
||||
|
||||
@ -1451,21 +1451,237 @@
|
||||
"passed": true,
|
||||
"duration": 2616,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:30:21.412Z",
|
||||
"passed": true,
|
||||
"duration": 1560,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:30:22.869Z",
|
||||
"passed": true,
|
||||
"duration": 1451,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:30:28.307Z",
|
||||
"passed": true,
|
||||
"duration": 5434,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"I went to the store yesterday."
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:30:29.513Z",
|
||||
"passed": false,
|
||||
"duration": 1201,
|
||||
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"\"I went to the store yesterday.\""
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:30:30.212Z",
|
||||
"passed": false,
|
||||
"duration": 695,
|
||||
"reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"I went to the store yesterday."
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:30:33.611Z",
|
||||
"passed": false,
|
||||
"duration": 3395,
|
||||
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A brown fox leaps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:30:34.920Z",
|
||||
"passed": false,
|
||||
"duration": 1304,
|
||||
"reason": "Expected A fox jumps over a dog, but got A brown fox leaps over a dog.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A fox jumps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:30:35.620Z",
|
||||
"passed": false,
|
||||
"duration": 692,
|
||||
"reason": "Expected A fox jumps over a dog, but got A fox jumps over a dog.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"\"A quick brown fox leaps over a dog.\""
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:30:49.662Z",
|
||||
"passed": false,
|
||||
"duration": 14038,
|
||||
"reason": "Expected A fox jumps over a dog, but got \"A quick brown fox leaps over a dog.\"",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "language_detection",
|
||||
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
||||
"result": [
|
||||
"French"
|
||||
],
|
||||
"expected": "French",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:30:50.805Z",
|
||||
"passed": true,
|
||||
"duration": 1137,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "language_detection",
|
||||
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
||||
"result": [
|
||||
"French"
|
||||
],
|
||||
"expected": "French",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:30:51.269Z",
|
||||
"passed": true,
|
||||
"duration": 459,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "language_detection",
|
||||
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
||||
"result": [
|
||||
"French"
|
||||
],
|
||||
"expected": "French",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:30:55.198Z",
|
||||
"passed": true,
|
||||
"duration": 3924,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "synonyms",
|
||||
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
|
||||
"result": [
|
||||
"Joyful"
|
||||
],
|
||||
"expected": "joyful",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:30:56.455Z",
|
||||
"passed": true,
|
||||
"duration": 1251,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "synonyms",
|
||||
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
|
||||
"result": [
|
||||
"Joyful"
|
||||
],
|
||||
"expected": "joyful",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:30:57.083Z",
|
||||
"passed": true,
|
||||
"duration": 622,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "synonyms",
|
||||
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
|
||||
"result": [
|
||||
"joyful"
|
||||
],
|
||||
"expected": "joyful",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:31:00.924Z",
|
||||
"passed": true,
|
||||
"duration": 3836,
|
||||
"category": "language"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
{
|
||||
"test": "translation",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 666,
|
||||
"duration_secs": 0.666
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 818,
|
||||
"duration_secs": 0.818
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1451,
|
||||
"duration_secs": 1.451
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1479,8 +1695,8 @@
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1171,
|
||||
"duration_secs": 1.171
|
||||
"duration": 695,
|
||||
"duration_secs": 0.695
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1494,41 +1710,41 @@
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 699,
|
||||
"duration_secs": 0.699
|
||||
"duration": 692,
|
||||
"duration_secs": 0.692
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "language_detection",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 459,
|
||||
"duration_secs": 0.459
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 695,
|
||||
"duration_secs": 0.695
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 776,
|
||||
"duration_secs": 0.776
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "synonyms",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 548,
|
||||
"duration_secs": 0.548
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 570,
|
||||
"duration_secs": 0.57
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 622,
|
||||
"duration_secs": 0.622
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-06-05T18:56:08.627Z"
|
||||
"lastUpdated": "2025-06-05T22:31:00.924Z"
|
||||
}
|
||||
@ -6,164 +6,157 @@
|
||||
|
||||
| Test | Model | Duration (ms) | Duration (s) |
|
||||
|------|-------|--------------|--------------|
|
||||
| translation | openai/gpt-4o-mini | 666 | 0.67 |
|
||||
| translation | anthropic/claude-sonnet-4 | 1317 | 1.32 |
|
||||
| translation | deepseek/deepseek-r1:free | 5397 | 5.40 |
|
||||
| grammar | openai/gpt-4o-mini | 1171 | 1.17 |
|
||||
| grammar | anthropic/claude-sonnet-4 | 1722 | 1.72 |
|
||||
| grammar | deepseek/deepseek-r1:free | 5199 | 5.20 |
|
||||
| summarization | openai/gpt-4o-mini | 699 | 0.70 |
|
||||
| summarization | anthropic/claude-sonnet-4 | 1820 | 1.82 |
|
||||
| summarization | deepseek/deepseek-r1:free | 7380 | 7.38 |
|
||||
| language_detection | openai/gpt-4o-mini | 776 | 0.78 |
|
||||
| language_detection | anthropic/claude-sonnet-4 | 1725 | 1.73 |
|
||||
| language_detection | deepseek/deepseek-r1:free | 5247 | 5.25 |
|
||||
| synonyms | openai/gpt-4o-mini | 548 | 0.55 |
|
||||
| synonyms | anthropic/claude-sonnet-4 | 1967 | 1.97 |
|
||||
| synonyms | deepseek/deepseek-r1:free | 2616 | 2.62 |
|
||||
| translation | openai/gpt-4o-mini | 1451 | 1.45 |
|
||||
| translation | anthropic/claude-sonnet-4 | 1560 | 1.56 |
|
||||
| translation | deepseek/deepseek-r1:free | 5434 | 5.43 |
|
||||
| grammar | openai/gpt-4o-mini | 695 | 0.69 |
|
||||
| grammar | anthropic/claude-sonnet-4 | 1201 | 1.20 |
|
||||
| grammar | deepseek/deepseek-r1:free | 3395 | 3.40 |
|
||||
| summarization | openai/gpt-4o-mini | 692 | 0.69 |
|
||||
| summarization | anthropic/claude-sonnet-4 | 1304 | 1.30 |
|
||||
| summarization | deepseek/deepseek-r1:free | 14038 | 14.04 |
|
||||
| language_detection | openai/gpt-4o-mini | 459 | 0.46 |
|
||||
| language_detection | anthropic/claude-sonnet-4 | 1137 | 1.14 |
|
||||
| language_detection | deepseek/deepseek-r1:free | 3924 | 3.92 |
|
||||
| synonyms | openai/gpt-4o-mini | 622 | 0.62 |
|
||||
| synonyms | anthropic/claude-sonnet-4 | 1251 | 1.25 |
|
||||
| synonyms | deepseek/deepseek-r1:free | 3836 | 3.84 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 15
|
||||
- Passed: 2
|
||||
- Failed: 13
|
||||
- Success Rate: 13.33%
|
||||
- Average Duration: 2550ms (2.55s)
|
||||
- Passed: 9
|
||||
- Failed: 6
|
||||
- Success Rate: 60.00%
|
||||
- Average Duration: 2733ms (2.73s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### translation - anthropic/claude-sonnet-4
|
||||
|
||||
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `¡Hola, mundo!`
|
||||
- Actual: `¡Hola, mundo!`
|
||||
- Duration: 1317ms (1.32s)
|
||||
- Reason: Expected ¡Hola, mundo!, but got ¡hola, mundo!
|
||||
- Timestamp: 6/5/2025, 8:55:31 PM
|
||||
|
||||
### translation - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `¡Hola, mundo!`
|
||||
- Actual: `¡Hola, mundo!`
|
||||
- Duration: 666ms (0.67s)
|
||||
- Reason: Expected ¡Hola, mundo!, but got ¡hola, mundo!
|
||||
- Timestamp: 6/5/2025, 8:55:32 PM
|
||||
|
||||
### translation - deepseek/deepseek-r1:free
|
||||
|
||||
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `¡Hola, mundo!`
|
||||
- Actual: `¡Hola, mundo!`
|
||||
- Duration: 5397ms (5.40s)
|
||||
- Reason: Expected ¡Hola, mundo!, but got ¡hola, mundo!
|
||||
- Timestamp: 6/5/2025, 8:55:37 PM
|
||||
|
||||
### grammar - anthropic/claude-sonnet-4
|
||||
|
||||
- Prompt: `Correct the grammar in: "I goes to the store yesterday". Return only the corrected sentence, no explanation.`
|
||||
- Expected: `I went to the store yesterday`
|
||||
- Actual: `I went to the store yesterday.`
|
||||
- Duration: 1722ms (1.72s)
|
||||
- Reason: Expected I went to the store yesterday, but got i went to the store yesterday.
|
||||
- Timestamp: 6/5/2025, 8:55:39 PM
|
||||
- Duration: 1201ms (1.20s)
|
||||
- Reason: Expected I went to the store yesterday, but got I went to the store yesterday.
|
||||
- Timestamp: 6/6/2025, 12:30:29 AM
|
||||
|
||||
### grammar - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Correct the grammar in: "I goes to the store yesterday". Return only the corrected sentence, no explanation.`
|
||||
- Expected: `I went to the store yesterday`
|
||||
- Actual: `"I went to the store yesterday."`
|
||||
- Duration: 1171ms (1.17s)
|
||||
- Reason: Expected I went to the store yesterday, but got "i went to the store yesterday."
|
||||
- Timestamp: 6/5/2025, 8:55:40 PM
|
||||
- Duration: 695ms (0.69s)
|
||||
- Reason: Expected I went to the store yesterday, but got "I went to the store yesterday."
|
||||
- Timestamp: 6/6/2025, 12:30:30 AM
|
||||
|
||||
### grammar - deepseek/deepseek-r1:free
|
||||
|
||||
- Prompt: `Correct the grammar in: "I goes to the store yesterday". Return only the corrected sentence, no explanation.`
|
||||
- Expected: `I went to the store yesterday`
|
||||
- Actual: `"I went to the store yesterday."`
|
||||
- Duration: 5199ms (5.20s)
|
||||
- Reason: Expected I went to the store yesterday, but got "i went to the store yesterday."
|
||||
- Timestamp: 6/5/2025, 8:55:45 PM
|
||||
- Actual: `I went to the store yesterday.`
|
||||
- Duration: 3395ms (3.40s)
|
||||
- Reason: Expected I went to the store yesterday, but got I went to the store yesterday.
|
||||
- Timestamp: 6/6/2025, 12:30:33 AM
|
||||
|
||||
### summarization - anthropic/claude-sonnet-4
|
||||
|
||||
- Prompt: `Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.`
|
||||
- Expected: `A fox jumps over a dog`
|
||||
- Actual: `A fox jumps over a dog.`
|
||||
- Duration: 1820ms (1.82s)
|
||||
- Reason: Expected A fox jumps over a dog, but got a fox jumps over a dog.
|
||||
- Timestamp: 6/5/2025, 8:55:47 PM
|
||||
- Actual: `A brown fox leaps over a dog.`
|
||||
- Duration: 1304ms (1.30s)
|
||||
- Reason: Expected A fox jumps over a dog, but got A brown fox leaps over a dog.
|
||||
- Timestamp: 6/6/2025, 12:30:34 AM
|
||||
|
||||
### summarization - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.`
|
||||
- Expected: `A fox jumps over a dog`
|
||||
- Actual: `A fox jumps over a dog.`
|
||||
- Duration: 699ms (0.70s)
|
||||
- Reason: Expected A fox jumps over a dog, but got a fox jumps over a dog.
|
||||
- Timestamp: 6/5/2025, 8:55:48 PM
|
||||
- Duration: 692ms (0.69s)
|
||||
- Reason: Expected A fox jumps over a dog, but got A fox jumps over a dog.
|
||||
- Timestamp: 6/6/2025, 12:30:35 AM
|
||||
|
||||
### summarization - deepseek/deepseek-r1:free
|
||||
|
||||
- Prompt: `Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.`
|
||||
- Expected: `A fox jumps over a dog`
|
||||
- Actual: `A quick brown fox leaps over a dog.`
|
||||
- Duration: 7380ms (7.38s)
|
||||
- Reason: Expected A fox jumps over a dog, but got a quick brown fox leaps over a dog.
|
||||
- Timestamp: 6/5/2025, 8:55:55 PM
|
||||
- Actual: `"A quick brown fox leaps over a dog."`
|
||||
- Duration: 14038ms (14.04s)
|
||||
- Reason: Expected A fox jumps over a dog, but got "A quick brown fox leaps over a dog."
|
||||
- Timestamp: 6/6/2025, 12:30:49 AM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### translation - anthropic/claude-sonnet-4
|
||||
|
||||
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `¡Hola, mundo!`
|
||||
- Actual: `¡Hola, mundo!`
|
||||
- Duration: 1560ms (1.56s)
|
||||
- Timestamp: 6/6/2025, 12:30:21 AM
|
||||
|
||||
### translation - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `¡Hola, mundo!`
|
||||
- Actual: `¡Hola, mundo!`
|
||||
- Duration: 1451ms (1.45s)
|
||||
- Timestamp: 6/6/2025, 12:30:22 AM
|
||||
|
||||
### translation - deepseek/deepseek-r1:free
|
||||
|
||||
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `¡Hola, mundo!`
|
||||
- Actual: `¡Hola, mundo!`
|
||||
- Duration: 5434ms (5.43s)
|
||||
- Timestamp: 6/6/2025, 12:30:28 AM
|
||||
|
||||
### language_detection - anthropic/claude-sonnet-4
|
||||
|
||||
- Prompt: `Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.`
|
||||
- Expected: `French`
|
||||
- Actual: `French`
|
||||
- Duration: 1725ms (1.73s)
|
||||
- Reason: Expected French, but got french
|
||||
- Timestamp: 6/5/2025, 8:55:57 PM
|
||||
- Duration: 1137ms (1.14s)
|
||||
- Timestamp: 6/6/2025, 12:30:50 AM
|
||||
|
||||
### language_detection - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.`
|
||||
- Expected: `French`
|
||||
- Actual: `French`
|
||||
- Duration: 776ms (0.78s)
|
||||
- Reason: Expected French, but got french
|
||||
- Timestamp: 6/5/2025, 8:55:58 PM
|
||||
- Duration: 459ms (0.46s)
|
||||
- Timestamp: 6/6/2025, 12:30:51 AM
|
||||
|
||||
### language_detection - deepseek/deepseek-r1:free
|
||||
|
||||
- Prompt: `Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.`
|
||||
- Expected: `French`
|
||||
- Actual: `French`
|
||||
- Duration: 5247ms (5.25s)
|
||||
- Reason: Expected French, but got french
|
||||
- Timestamp: 6/5/2025, 8:56:03 PM
|
||||
- Duration: 3924ms (3.92s)
|
||||
- Timestamp: 6/6/2025, 12:30:55 AM
|
||||
|
||||
### synonyms - anthropic/claude-sonnet-4
|
||||
|
||||
- Prompt: `Provide a synonym for "happy". Return only the synonym, no explanation.`
|
||||
- Expected: `joyful`
|
||||
- Actual: `Content`
|
||||
- Duration: 1967ms (1.97s)
|
||||
- Reason: Expected joyful, but got content
|
||||
- Timestamp: 6/5/2025, 8:56:05 PM
|
||||
|
||||
## Passed Tests
|
||||
- Actual: `Joyful`
|
||||
- Duration: 1251ms (1.25s)
|
||||
- Timestamp: 6/6/2025, 12:30:56 AM
|
||||
|
||||
### synonyms - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Provide a synonym for "happy". Return only the synonym, no explanation.`
|
||||
- Expected: `joyful`
|
||||
- Actual: `Joyful`
|
||||
- Duration: 548ms (0.55s)
|
||||
- Timestamp: 6/5/2025, 8:56:06 PM
|
||||
- Duration: 622ms (0.62s)
|
||||
- Timestamp: 6/6/2025, 12:30:57 AM
|
||||
|
||||
### synonyms - deepseek/deepseek-r1:free
|
||||
|
||||
- Prompt: `Provide a synonym for "happy". Return only the synonym, no explanation.`
|
||||
- Expected: `joyful`
|
||||
- Actual: `Joyful`
|
||||
- Duration: 2616ms (2.62s)
|
||||
- Timestamp: 6/5/2025, 8:56:08 PM
|
||||
- Actual: `joyful`
|
||||
- Duration: 3836ms (3.84s)
|
||||
- Timestamp: 6/6/2025, 12:31:00 AM
|
||||
|
||||
|
||||