test commons
This commit is contained in:
parent
856ffe680f
commit
1c3a2d981e
@ -1,541 +0,0 @@
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:39.340Z",
|
||||
"passed": true,
|
||||
"duration": 1551
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:42.962Z",
|
||||
"passed": true,
|
||||
"duration": 3621
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:43.836Z",
|
||||
"passed": true,
|
||||
"duration": 873
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:47.309Z",
|
||||
"passed": true,
|
||||
"duration": 3472
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:48.493Z",
|
||||
"passed": true,
|
||||
"duration": 1183
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:53.335Z",
|
||||
"passed": true,
|
||||
"duration": 4841
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:37.069Z",
|
||||
"passed": true,
|
||||
"duration": 1256
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:40.167Z",
|
||||
"passed": false,
|
||||
"duration": 3096,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error",
|
||||
"details": {
|
||||
"stack": "Error: Model returned empty response\n at Module.runTest (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:85:13)\n at processTicksAndRejections (node:internal/process/task_queues:105:5)\n at __vite_ssr_import_0__.it.each.timeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\basic.test.ts:21:20)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:533:5\n at runTest (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1056:11)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runFiles (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1262:5)\n at startTests (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1271:3)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/vitest/dist/chunks/runBaseTests.3qpJUEJM.js:126:11",
|
||||
"message": "Model returned empty response"
|
||||
}
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:41.086Z",
|
||||
"passed": true,
|
||||
"duration": 916
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:45.879Z",
|
||||
"passed": true,
|
||||
"duration": 4793
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:46.900Z",
|
||||
"passed": true,
|
||||
"duration": 1020
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:50.446Z",
|
||||
"passed": true,
|
||||
"duration": 3545
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:39:58.836Z",
|
||||
"passed": true,
|
||||
"duration": 1266
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:40:02.777Z",
|
||||
"passed": true,
|
||||
"duration": 3939
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:40:03.961Z",
|
||||
"passed": true,
|
||||
"duration": 1183
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:40:06.962Z",
|
||||
"passed": true,
|
||||
"duration": 3000
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:40:08.115Z",
|
||||
"passed": true,
|
||||
"duration": 1152
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:40:12.565Z",
|
||||
"passed": true,
|
||||
"duration": 4449
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:41:52.176Z",
|
||||
"passed": true,
|
||||
"duration": 1458
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:41:55.869Z",
|
||||
"passed": true,
|
||||
"duration": 3691
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:41:57.106Z",
|
||||
"passed": true,
|
||||
"duration": 1236
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:41:59.974Z",
|
||||
"passed": true,
|
||||
"duration": 2867
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:42:01.272Z",
|
||||
"passed": true,
|
||||
"duration": 1297
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:42:04.326Z",
|
||||
"passed": true,
|
||||
"duration": 3053
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:39.002Z",
|
||||
"passed": true,
|
||||
"duration": 1196
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:48.668Z",
|
||||
"passed": true,
|
||||
"duration": 9664
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:49.806Z",
|
||||
"passed": true,
|
||||
"duration": 1137
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:53.017Z",
|
||||
"passed": true,
|
||||
"duration": 3210
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:53.814Z",
|
||||
"passed": true,
|
||||
"duration": 796
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:55.383Z",
|
||||
"passed": true,
|
||||
"duration": 1568
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:47:00.955Z",
|
||||
"passed": true,
|
||||
"duration": 1297
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:09.343Z",
|
||||
"passed": true,
|
||||
"duration": 1278
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:15.630Z",
|
||||
"passed": true,
|
||||
"duration": 6285
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:16.246Z",
|
||||
"passed": true,
|
||||
"duration": 615
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:25.857Z",
|
||||
"passed": true,
|
||||
"duration": 9610
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:27.101Z",
|
||||
"passed": true,
|
||||
"duration": 1242
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:31.142Z",
|
||||
"passed": true,
|
||||
"duration": 4040
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
{
|
||||
"test": "addition",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1278,
|
||||
"duration_secs": 1.278
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 6285,
|
||||
"duration_secs": 6.285
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 615,
|
||||
"duration_secs": 0.615
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 9610,
|
||||
"duration_secs": 9.61
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1242,
|
||||
"duration_secs": 1.242
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 4040,
|
||||
"duration_secs": 4.04
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-02T10:49:31.142Z"
|
||||
}
|
||||
@ -2,20 +2,24 @@ import { describe, it, expect } from 'vitest'
|
||||
import * as path from 'node:path'
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import {
|
||||
models,
|
||||
getDefaultModels,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
runTest,
|
||||
generateTestReport
|
||||
generateTestReport,
|
||||
getReportPaths
|
||||
} from './commons'
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './basic.json')
|
||||
// Optionally override models for this specific test file
|
||||
const models = getDefaultModels()
|
||||
|
||||
describe('Basic Operations', () => {
|
||||
let testResults: TestResult[] = []
|
||||
const TEST_LOG_PATH = getReportPaths('basic', 'json')
|
||||
const TEST_REPORT_PATH = getReportPaths('basic', 'md')
|
||||
|
||||
it.each(models)('should add two numbers with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
@ -27,7 +31,7 @@ describe('Basic Operations', () => {
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('8')
|
||||
}, { timeout: 10000 })
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
@ -39,7 +43,7 @@ describe('Basic Operations', () => {
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('24')
|
||||
}, { timeout: 10000 })
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should divide two numbers with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
@ -51,11 +55,10 @@ describe('Basic Operations', () => {
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('5')
|
||||
}, { timeout: 10000 })
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
const reportPath = path.resolve(__dirname, './basic-report.md')
|
||||
generateTestReport(testResults, 'Basic Operations Test Results', reportPath)
|
||||
expect(exists(reportPath) === 'file').toBe(true)
|
||||
generateTestReport(testResults, 'Basic Operations Test Results', TEST_REPORT_PATH)
|
||||
expect(exists(TEST_REPORT_PATH) === 'file').toBe(true)
|
||||
})
|
||||
})
|
||||
@ -4,18 +4,36 @@ import { run } from '../../src/index'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { sync as mkdirp } from "mkdirp"
|
||||
|
||||
export const models = [
|
||||
export const getDefaultModels = () => [
|
||||
//E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
|
||||
E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
|
||||
E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B
|
||||
]
|
||||
|
||||
export const isOpenRouterModel = (model: string): boolean => {
|
||||
return Object.values(E_OPENROUTER_MODEL).includes(model as E_OPENROUTER_MODEL)
|
||||
}
|
||||
|
||||
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
|
||||
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
|
||||
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
|
||||
export const TEST_TIMEOUT = 30000 // 30 seconds timeout for API calls
|
||||
|
||||
// Report paths configuration
|
||||
export const REPORTS_DIR = path.resolve(__dirname, './reports')
|
||||
|
||||
// Ensure reports directory exists
|
||||
if (exists(REPORTS_DIR) !== 'directory') {
|
||||
mkdirp(REPORTS_DIR)
|
||||
}
|
||||
|
||||
export const getReportPaths = (category: string, type: 'json' | 'md'): string => {
|
||||
const base = path.resolve(REPORTS_DIR, category)
|
||||
return `${base}.${type}`
|
||||
}
|
||||
|
||||
export interface TestResult {
|
||||
test: string;
|
||||
prompt: string;
|
||||
@ -106,7 +124,7 @@ export const runTest = async (
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
router = isOpenRouterModel(model) ? 'openrouter' : 'unknown'
|
||||
return options
|
||||
}
|
||||
}),
|
||||
@ -163,7 +181,7 @@ export const runTest = async (
|
||||
throw e
|
||||
} finally {
|
||||
if (testResult) {
|
||||
const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath)) : { results: [], highscores: [] }
|
||||
const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath) as string) : { results: [], highscores: [] }
|
||||
const updatedResults = [...(existingData.results || []), testResult]
|
||||
|
||||
// Group results by test and model
|
||||
|
||||
@ -1,61 +0,0 @@
|
||||
# Format Test Results
|
||||
|
||||
## Failed Tests
|
||||
|
||||
*No failed tests*
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### json-schema-file-format - mistralai/mistral-tiny
|
||||
- Prompt: `Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.`
|
||||
- Expected: `{"name":"John Doe","age":30,"tags":["developer","javascript"]}`
|
||||
- Actual: `{"name": "John Doe", "age": 30, "tags": ["developer", "javascript"]}`
|
||||
- Duration: 982ms
|
||||
- Timestamp: 4/2/2025, 10:26:33 AM
|
||||
|
||||
### json-schema-object-format - mistralai/mistral-tiny
|
||||
- Prompt: `Create a user profile with the following details:
|
||||
- Name: Jane Smith
|
||||
- Age: 25
|
||||
- Email: jane.smith@company.com
|
||||
- Tags: ["developer", "designer"]
|
||||
- Address: {
|
||||
"street": "123 Main St",
|
||||
"city": "New York",
|
||||
"country": "US",
|
||||
"postal_code": "10001"
|
||||
}
|
||||
- Preferences: {
|
||||
"theme": "light",
|
||||
"notifications": "enabled",
|
||||
"language": "English"
|
||||
}
|
||||
Return only the JSON object, no explanation.`
|
||||
- Expected: `{"name":"Jane Smith","age":25,"email":"jane.smith@company.com","tags":["developer","designer"],"address":{"street":"123 Main St","city":"New York","country":"US","postal_code":"10001"},"preferences":{"theme":"light","notifications":"enabled","language":"English"}}`
|
||||
- Actual: `{
|
||||
"name": "Jane Smith",
|
||||
"age": 25,
|
||||
"email": "jane.smith@company.com",
|
||||
"tags": ["developer", "designer"],
|
||||
"address": {
|
||||
"street": "123 Main St",
|
||||
"city": "New York",
|
||||
"country": "US",
|
||||
"postal_code": "10001"
|
||||
},
|
||||
"Preferences": {
|
||||
"theme": "light",
|
||||
"notifications": "enabled",
|
||||
"language": "English"
|
||||
}
|
||||
}`
|
||||
- Duration: 1673ms
|
||||
- Timestamp: 4/2/2025, 10:26:35 AM
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 2
|
||||
- Passed: 2
|
||||
- Failed: 0
|
||||
- Success Rate: 100.00%
|
||||
|
||||
@ -1,28 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test": "json-schema-file-format",
|
||||
"prompt": "Create a user profile with name John Doe, age 30, and tags [\"developer\", \"javascript\"]. Return only the JSON object, no explanation.",
|
||||
"result": [
|
||||
"{\"name\": \"John Doe\", \"age\": 30, \"tags\": [\"developer\", \"javascript\"]}"
|
||||
],
|
||||
"expected": "{\"name\":\"John Doe\",\"age\":30,\"tags\":[\"developer\",\"javascript\"]}",
|
||||
"model": "mistralai/mistral-tiny",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:26:33.513Z",
|
||||
"passed": true,
|
||||
"duration": 982
|
||||
},
|
||||
{
|
||||
"test": "json-schema-object-format",
|
||||
"prompt": "Create a user profile with the following details:\n - Name: Jane Smith\n - Age: 25\n - Email: jane.smith@company.com\n - Tags: [\"developer\", \"designer\"]\n - Address: {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n }\n - Preferences: {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n Return only the JSON object, no explanation.",
|
||||
"result": [
|
||||
"{\n \"name\": \"Jane Smith\",\n \"age\": 25,\n \"email\": \"jane.smith@company.com\",\n \"tags\": [\"developer\", \"designer\"],\n \"address\": {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n },\n \"Preferences\": {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n}"
|
||||
],
|
||||
"expected": "{\"name\":\"Jane Smith\",\"age\":25,\"email\":\"jane.smith@company.com\",\"tags\":[\"developer\",\"designer\"],\"address\":{\"street\":\"123 Main St\",\"city\":\"New York\",\"country\":\"US\",\"postal_code\":\"10001\"},\"preferences\":{\"theme\":\"light\",\"notifications\":\"enabled\",\"language\":\"English\"}}",
|
||||
"model": "mistralai/mistral-tiny",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:26:35.187Z",
|
||||
"passed": true,
|
||||
"duration": 1673
|
||||
}
|
||||
]
|
||||
@ -6,16 +6,23 @@ import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { z } from 'zod'
|
||||
import {
|
||||
getDefaultModels,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
formatError,
|
||||
isEmptyResponse
|
||||
isEmptyResponse,
|
||||
runTest,
|
||||
generateTestReport,
|
||||
getReportPaths
|
||||
} from './commons'
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './format.json')
|
||||
// Optionally override models for this specific test file
|
||||
const models = getDefaultModels()
|
||||
|
||||
const TEST_LOG_PATH = getReportPaths('format', 'json')
|
||||
const TEST_SCHEMA_PATH = path.resolve(__dirname, './test-schema.json')
|
||||
const TEST_MODEL_FAST = 'mistralai/codestral-2501'
|
||||
const TEST_MODEL = 'mistralai/mistral-tiny'
|
||||
@ -136,6 +143,76 @@ const hasValidArrayLength = (arr: any[], length: number) => {
|
||||
return Array.isArray(arr) && arr.length === length && arr.every(item => typeof item === 'string')
|
||||
}
|
||||
|
||||
describe('Format Operations', () => {
|
||||
let testResults: TestResult[] = []
|
||||
const TEST_REPORT_PATH = getReportPaths('format', 'md')
|
||||
|
||||
it.each(models)('should format JSON with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Format this JSON: {"name":"John","age":30}. Return only the formatted JSON, no explanation.',
|
||||
'{\n "name": "John",\n "age": 30\n}',
|
||||
'json_formatting',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('{\n "name": "john",\n "age": 30\n}')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should format markdown with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Format this markdown: #title ##subtitle text. Return only the formatted markdown, no explanation.',
|
||||
'# Title\n\n## Subtitle\n\nText',
|
||||
'markdown_formatting',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('# title\n\n## subtitle\n\ntext')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should format code with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Format this code: function add(a,b){return a+b}. Return only the formatted code, no explanation.',
|
||||
'function add(a, b) {\n return a + b;\n}',
|
||||
'code_formatting',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('function add(a, b) {\n return a + b;\n}')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should format date with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Format this date: 2024-03-15. Return only the formatted date in MM/DD/YYYY format, no explanation.',
|
||||
'03/15/2024',
|
||||
'date_formatting',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('03/15/2024')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should format currency with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Format this number as USD currency: 1234.56. Return only the formatted currency, no explanation.',
|
||||
'$1,234.56',
|
||||
'currency_formatting',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('$1,234.56')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
generateTestReport(testResults, 'Format Operations Test Results', TEST_REPORT_PATH)
|
||||
expect(exists(TEST_REPORT_PATH) === 'file').toBe(true)
|
||||
})
|
||||
})
|
||||
|
||||
describe('Format Options', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
@ -416,92 +493,4 @@ describe('Format Options', () => {
|
||||
}
|
||||
}
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
// Group results by test and model
|
||||
const latestResults = new Map<string, Map<string, TestResult>>()
|
||||
|
||||
// Get only the latest result for each test+model combination
|
||||
testResults.forEach(result => {
|
||||
if (!latestResults.has(result.test)) {
|
||||
latestResults.set(result.test, new Map())
|
||||
}
|
||||
const testMap = latestResults.get(result.test)!
|
||||
const existingResult = testMap.get(result.model)
|
||||
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
|
||||
testMap.set(result.model, result)
|
||||
}
|
||||
})
|
||||
|
||||
// Generate markdown report
|
||||
let report = '# Format Test Results\n\n'
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
let hasFailures = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
hasFailures = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
if (result.error) {
|
||||
report += `- Error Type: ${result.error.type}\n`
|
||||
report += `- Error Code: ${result.error.code}\n`
|
||||
report += `- Error Message: ${result.error.message}\n`
|
||||
if (result.error.details?.message) {
|
||||
report += `- Error Details: ${result.error.details.message}\n`
|
||||
}
|
||||
}
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFailures) {
|
||||
report += '*No failed tests*\n\n'
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
let hasPassed = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
hasPassed = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasPassed) {
|
||||
report += '*No passed tests*\n\n'
|
||||
}
|
||||
|
||||
// Add summary section
|
||||
report += '## Summary\n\n'
|
||||
const totalTests = testResults.length
|
||||
const passedTests = testResults.filter(r => r.passed).length
|
||||
const failedTests = totalTests - passedTests
|
||||
report += `- Total Tests: ${totalTests}\n`
|
||||
report += `- Passed: ${passedTests}\n`
|
||||
report += `- Failed: ${failedTests}\n`
|
||||
report += `- Success Rate: ${((passedTests / totalTests) * 100).toFixed(2)}%\n\n`
|
||||
|
||||
// Write report to file
|
||||
const reportPath = path.resolve(__dirname, './format-report.md')
|
||||
write(reportPath, report)
|
||||
|
||||
// Verify report was written
|
||||
expect(exists(reportPath) === 'file').toBe(true)
|
||||
})
|
||||
})
|
||||
@ -1,234 +0,0 @@
|
||||
# Language Test Results
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### german - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 985ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:50 PM
|
||||
|
||||
### german - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 746ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:51 PM
|
||||
|
||||
### german - gpt-4
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 1067ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:05:52 PM
|
||||
|
||||
### german - anthropic/claude-2.0
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 1192ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 11:53:55 PM
|
||||
|
||||
### german - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 1753ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:29:45 AM
|
||||
|
||||
### german - qwen/qwq-32b
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 13757ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:29:59 AM
|
||||
|
||||
### spanish - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 678ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:53 PM
|
||||
|
||||
### spanish - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 744ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:53 PM
|
||||
|
||||
### spanish - gpt-4
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 1125ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:05:55 PM
|
||||
|
||||
### spanish - anthropic/claude-2.0
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 1193ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 11:53:56 PM
|
||||
|
||||
### spanish - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 1242ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:29:51 AM
|
||||
|
||||
### spanish - qwen/qwq-32b
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 5925ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:29:57 AM
|
||||
|
||||
### french - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 626ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:55 PM
|
||||
|
||||
### french - gpt-4
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 1341ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:05:57 PM
|
||||
|
||||
### french - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 729ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:56 PM
|
||||
|
||||
### french - anthropic/claude-2.0
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 968ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 11:53:57 PM
|
||||
|
||||
### french - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 2656ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:29:59 AM
|
||||
|
||||
### french - qwen/qwq-32b
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 4063ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:30:03 AM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### german_translation - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "hello" to German. Return only the translation, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo`
|
||||
- Duration: n/a
|
||||
- Timestamp: 4/1/2025, 12:56:01 PM
|
||||
|
||||
### german_translation - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "hello" to German. Return only the translation, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo\n`
|
||||
- Duration: n/a
|
||||
- Timestamp: 4/1/2025, 12:56:02 PM
|
||||
|
||||
### german_translation - gpt-4
|
||||
- Prompt: `translate "hello" to German. Return only the translation, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo`
|
||||
- Duration: n/a
|
||||
- Timestamp: 4/1/2025, 12:56:32 PM
|
||||
|
||||
### spanish_translation - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `sí`
|
||||
- Duration: n/a
|
||||
- Timestamp: 4/1/2025, 12:56:05 PM
|
||||
|
||||
### spanish_translation - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `sí\n`
|
||||
- Duration: n/a
|
||||
- Timestamp: 4/1/2025, 12:56:06 PM
|
||||
|
||||
### spanish_translation - gpt-4
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `sí`
|
||||
- Duration: n/a
|
||||
- Timestamp: 4/1/2025, 12:56:35 PM
|
||||
|
||||
### french_translation - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "no" to French. Return only the translation, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non`
|
||||
- Duration: n/a
|
||||
- Timestamp: 4/1/2025, 12:56:08 PM
|
||||
|
||||
### french_translation - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "no" to French. Return only the translation, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non\n`
|
||||
- Duration: n/a
|
||||
- Timestamp: 4/1/2025, 12:56:10 PM
|
||||
|
||||
### french_translation - gpt-4
|
||||
- Prompt: `translate "no" to French. Return only the translation, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non`
|
||||
- Duration: n/a
|
||||
- Timestamp: 4/1/2025, 12:56:37 PM
|
||||
|
||||
@ -1,919 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:01.527Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo\n"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:03.169Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:04.655Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:05.726Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí\n"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:08.264Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:09.514Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:11.281Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non\n"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:12.942Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:14.089Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:53.360Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo\n"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:54.551Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:55.699Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:56.740Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Sí\n"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:57.918Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:58.882Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:52:00.119Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Non\n"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:52:01.184Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:52:02.204Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:50.924Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo\n"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:52.477Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:53.546Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:56.815Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:58.718Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:54:01.673Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:01.199Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo\n"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:02.857Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:03.788Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:05.218Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí\n"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:06.317Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:07.436Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:08.879Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non\n"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:10.365Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:11.295Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:32.735Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:35.276Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:37.673Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:07.944Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Hallo\n"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:09.471Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:11.335Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:12.740Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Sí\n"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:14.246Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:15.205Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:18.139Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:20.589Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:03:09.890Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:03:12.312Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:03:14.660Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:50.723Z",
|
||||
"passed": false,
|
||||
"duration": 985,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:51.471Z",
|
||||
"passed": false,
|
||||
"duration": 746,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:52.540Z",
|
||||
"passed": false,
|
||||
"duration": 1067,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:53.219Z",
|
||||
"passed": false,
|
||||
"duration": 678,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:53.964Z",
|
||||
"passed": false,
|
||||
"duration": 744,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:55.090Z",
|
||||
"passed": false,
|
||||
"duration": 1125,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:55.717Z",
|
||||
"passed": false,
|
||||
"duration": 626,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:56.447Z",
|
||||
"passed": false,
|
||||
"duration": 729,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:57.790Z",
|
||||
"passed": false,
|
||||
"duration": 1341,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:47:02.933Z",
|
||||
"passed": false,
|
||||
"duration": 1416,
|
||||
"error": {
|
||||
"message": "getRouterForModel is not defined",
|
||||
"code": "UNKNOWN",
|
||||
"type": "ReferenceError",
|
||||
"details": {
|
||||
"stack": "ReferenceError: getRouterForModel is not defined\n at runTranslationTest (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\language.test.ts:78:19)\n at processTicksAndRejections (node:internal/process/task_queues:105:5)\n at C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\language.test.ts:106:5\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:533:5\n at runTest (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1056:11)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runFiles (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1262:5)\n at startTests (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1271:3)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/vitest/dist/chunks/runBaseTests.3qpJUEJM.js:126:11",
|
||||
"message": "getRouterForModel is not defined",
|
||||
"stackStr": "ReferenceError: getRouterForModel is not defined\n at runTranslationTest (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\language.test.ts:78:19)\n at processTicksAndRejections (node:internal/process/task_queues:105:5)\n at C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\language.test.ts:106:5\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:533:5\n at runTest (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1056:11)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runFiles (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1262:5)\n at startTests (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1271:3)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/vitest/dist/chunks/runBaseTests.3qpJUEJM.js:126:11",
|
||||
"nameStr": "ReferenceError",
|
||||
"expected": "undefined",
|
||||
"actual": "undefined"
|
||||
}
|
||||
},
|
||||
"reason": "getRouterForModel is not defined"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:47:04.007Z",
|
||||
"passed": false,
|
||||
"duration": 1071,
|
||||
"error": {
|
||||
"message": "getRouterForModel is not defined",
|
||||
"code": "UNKNOWN",
|
||||
"type": "ReferenceError",
|
||||
"details": {
|
||||
"stack": "ReferenceError: getRouterForModel is not defined\n at runTranslationTest (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\language.test.ts:78:19)\n at processTicksAndRejections (node:internal/process/task_queues:105:5)\n at C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\language.test.ts:115:5\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:533:5\n at runTest (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1056:11)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runFiles (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1262:5)\n at startTests (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1271:3)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/vitest/dist/chunks/runBaseTests.3qpJUEJM.js:126:11",
|
||||
"message": "getRouterForModel is not defined",
|
||||
"stackStr": "ReferenceError: getRouterForModel is not defined\n at runTranslationTest (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\language.test.ts:78:19)\n at processTicksAndRejections (node:internal/process/task_queues:105:5)\n at C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\language.test.ts:115:5\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:533:5\n at runTest (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1056:11)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runFiles (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1262:5)\n at startTests (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1271:3)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/vitest/dist/chunks/runBaseTests.3qpJUEJM.js:126:11",
|
||||
"nameStr": "ReferenceError",
|
||||
"expected": "undefined",
|
||||
"actual": "undefined"
|
||||
}
|
||||
},
|
||||
"reason": "getRouterForModel is not defined"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:47:05.064Z",
|
||||
"passed": false,
|
||||
"duration": 1056,
|
||||
"error": {
|
||||
"message": "getRouterForModel is not defined",
|
||||
"code": "UNKNOWN",
|
||||
"type": "ReferenceError",
|
||||
"details": {
|
||||
"stack": "ReferenceError: getRouterForModel is not defined\n at runTranslationTest (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\language.test.ts:78:19)\n at processTicksAndRejections (node:internal/process/task_queues:105:5)\n at C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\language.test.ts:124:5\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:533:5\n at runTest (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1056:11)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runFiles (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1262:5)\n at startTests (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1271:3)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/vitest/dist/chunks/runBaseTests.3qpJUEJM.js:126:11",
|
||||
"message": "getRouterForModel is not defined"
|
||||
}
|
||||
},
|
||||
"reason": "getRouterForModel is not defined"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:47:26.798Z",
|
||||
"passed": false,
|
||||
"duration": 1253,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:47:27.731Z",
|
||||
"passed": false,
|
||||
"duration": 932,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:47:28.596Z",
|
||||
"passed": false,
|
||||
"duration": 864,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T21:53:55.163Z",
|
||||
"passed": false,
|
||||
"duration": 1192,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T21:53:56.357Z",
|
||||
"passed": false,
|
||||
"duration": 1193,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "anthropic/claude-2.0",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T21:53:57.326Z",
|
||||
"passed": false,
|
||||
"duration": 968,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:29:45.316Z",
|
||||
"passed": false,
|
||||
"duration": 1753,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:29:51.577Z",
|
||||
"passed": false,
|
||||
"duration": 1242,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:29:57.503Z",
|
||||
"passed": false,
|
||||
"duration": 5925,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:29:59.075Z",
|
||||
"passed": false,
|
||||
"duration": 13757,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:29:59.249Z",
|
||||
"passed": false,
|
||||
"duration": 2656,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:30:03.313Z",
|
||||
"passed": false,
|
||||
"duration": 4063,
|
||||
"reason": "Unknown error occurred"
|
||||
}
|
||||
]
|
||||
@ -1,202 +1,88 @@
|
||||
import { describe, it, expect } from 'vitest'
|
||||
import { run } from '../../src/index'
|
||||
import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import {
|
||||
models,
|
||||
getDefaultModels,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
formatError,
|
||||
isEmptyResponse
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
runTest,
|
||||
generateTestReport,
|
||||
getReportPaths
|
||||
} from './commons'
|
||||
|
||||
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
|
||||
// Optionally override models for this specific test file
|
||||
const models = getDefaultModels()
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './language.json')
|
||||
|
||||
describe('Language Capabilities', () => {
|
||||
describe('Language Operations', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
// Load existing results if any
|
||||
if (exists(TEST_LOG_PATH)) {
|
||||
const data = read(TEST_LOG_PATH, 'json')
|
||||
testResults = Array.isArray(data) ? data : []
|
||||
}
|
||||
const TEST_LOG_PATH = getReportPaths('language', 'json')
|
||||
const TEST_REPORT_PATH = getReportPaths('language', 'md')
|
||||
|
||||
const runTranslationTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
let startTime = Date.now()
|
||||
let error: TestResult['error'] | undefined
|
||||
|
||||
try {
|
||||
const result = await Promise.race([
|
||||
run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}),
|
||||
new Promise((_, reject) =>
|
||||
setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
|
||||
)
|
||||
]) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim()?.toLowerCase() || ''
|
||||
const passed = actual === expected && !isEmptyResponse(result)
|
||||
|
||||
if (isEmptyResponse(result)) {
|
||||
throw new Error('Model returned empty response')
|
||||
}
|
||||
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
return {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
duration: Date.now() - startTime,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
}
|
||||
} catch (e) {
|
||||
error = formatError(e)
|
||||
throw e
|
||||
} finally {
|
||||
const testResult: TestResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed: false,
|
||||
duration: Date.now() - startTime,
|
||||
error,
|
||||
reason: error?.message || 'Unknown error occurred'
|
||||
}
|
||||
|
||||
testResults.push(testResult)
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
}
|
||||
}
|
||||
|
||||
it.each(models)('should translate "hello" to German with model %s', async (modelName) => {
|
||||
await runTranslationTest(
|
||||
'translate "hello" to German. Return only the translated word, no explanation.',
|
||||
'hallo',
|
||||
'german',
|
||||
modelName
|
||||
it.each(models)('should translate text with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Translate "Hello, world!" to Spanish. Return only the translation, no explanation.',
|
||||
'¡Hola, mundo!',
|
||||
'translation',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
})
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('¡hola, mundo!')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should translate "yes" to Spanish with model %s', async (modelName) => {
|
||||
await runTranslationTest(
|
||||
'translate "yes" to Spanish. Return only the translated word, no explanation.',
|
||||
'sí',
|
||||
'spanish',
|
||||
modelName
|
||||
it.each(models)('should correct grammar with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Correct the grammar in: "I goes to the store yesterday". Return only the corrected sentence, no explanation.',
|
||||
'I went to the store yesterday',
|
||||
'grammar',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
})
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('i went to the store yesterday')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should translate "no" to French with model %s', async (modelName) => {
|
||||
await runTranslationTest(
|
||||
'translate "no" to French. Return only the translated word, no explanation.',
|
||||
'non',
|
||||
'french',
|
||||
modelName
|
||||
it.each(models)('should summarize text with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Summarize: "The quick brown fox jumps over the lazy dog". Return only the summary, no explanation.',
|
||||
'A fox jumps over a dog',
|
||||
'summarization',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
})
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('a fox jumps over a dog')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should identify language with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.',
|
||||
'French',
|
||||
'language_detection',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('french')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should generate synonyms with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Provide a synonym for "happy". Return only the synonym, no explanation.',
|
||||
'joyful',
|
||||
'synonyms',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('joyful')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
// Group results by test and model
|
||||
const latestResults = new Map<string, Map<string, TestResult>>()
|
||||
|
||||
// Get only the latest result for each test+model combination
|
||||
testResults.forEach(result => {
|
||||
if (!latestResults.has(result.test)) {
|
||||
latestResults.set(result.test, new Map())
|
||||
}
|
||||
const testMap = latestResults.get(result.test)!
|
||||
const existingResult = testMap.get(result.model)
|
||||
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
|
||||
testMap.set(result.model, result)
|
||||
}
|
||||
})
|
||||
|
||||
// Generate markdown report
|
||||
let report = '# Language Test Results\n\n'
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
let hasFailures = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
hasFailures = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
if (result.error) {
|
||||
report += `- Error Type: ${result.error.type}\n`
|
||||
report += `- Error Code: ${result.error.code}\n`
|
||||
report += `- Error Message: ${result.error.message}\n`
|
||||
}
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFailures) {
|
||||
report += '*No failed tests*\n\n'
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
let hasPassed = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
hasPassed = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasPassed) {
|
||||
report += '*No passed tests*\n\n'
|
||||
}
|
||||
|
||||
// Write report to file
|
||||
const reportPath = path.resolve(__dirname, './language-report.md')
|
||||
write(reportPath, report)
|
||||
|
||||
// Verify report was written
|
||||
expect(exists(reportPath) === 'file').toBe(true)
|
||||
generateTestReport(testResults, 'Language Operations Test Results', TEST_REPORT_PATH)
|
||||
expect(exists(TEST_REPORT_PATH) === 'file').toBe(true)
|
||||
})
|
||||
})
|
||||
@ -1,185 +0,0 @@
|
||||
# Math Test Results
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### addition - deepseek/deepseek-chat:free
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: ``
|
||||
- Duration: 913ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:42:47 PM
|
||||
|
||||
### addition - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: ``
|
||||
- Duration: 1112ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:45:50 PM
|
||||
|
||||
### addition - gpt-4
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: ``
|
||||
- Duration: 1038ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:42:26 PM
|
||||
|
||||
### addition - anthropic/claude-2.0
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: ``
|
||||
- Duration: 1992ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 11:53:40 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-chat:free
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: ``
|
||||
- Duration: 636ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:42:48 PM
|
||||
|
||||
### multiplication - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: ``
|
||||
- Duration: 764ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:45:51 PM
|
||||
|
||||
### multiplication - gpt-4
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: ``
|
||||
- Duration: 1052ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:42:28 PM
|
||||
|
||||
### multiplication - anthropic/claude-2.0
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: ``
|
||||
- Duration: 1078ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 11:53:41 PM
|
||||
|
||||
### division - deepseek/deepseek-chat:free
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: ``
|
||||
- Duration: 648ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:42:50 PM
|
||||
|
||||
### division - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: ``
|
||||
- Duration: 829ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:45:52 PM
|
||||
|
||||
### division - gpt-4
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: ``
|
||||
- Duration: 959ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:42:31 PM
|
||||
|
||||
### division - anthropic/claude-2.0
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: ``
|
||||
- Duration: 940ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 11:53:42 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### factorial - deepseek/deepseek-chat:free
|
||||
- Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:59:09 PM
|
||||
|
||||
### factorial - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120
|
||||
`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:59:10 PM
|
||||
|
||||
### factorial - gpt-4
|
||||
- Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 1:03:25 PM
|
||||
|
||||
### fibonacci - deepseek/deepseek-chat:free
|
||||
- Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
|
||||
- Expected: `0,1,1,2,3`
|
||||
- Actual: `0,1,1,2,3`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:59:13 PM
|
||||
|
||||
### fibonacci - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
|
||||
- Expected: `0,1,1,2,3`
|
||||
- Actual: `0,1,1,2,3
|
||||
`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:59:14 PM
|
||||
|
||||
### fibonacci - gpt-4
|
||||
- Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
|
||||
- Expected: `0,1,1,2,3`
|
||||
- Actual: `0, 1, 1, 2, 3`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 1:03:27 PM
|
||||
|
||||
### quadratic - deepseek/deepseek-chat:free
|
||||
- Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
|
||||
- Expected: `[-3,-2]`
|
||||
- Actual: `[-2, -3]`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:59:19 PM
|
||||
|
||||
### quadratic - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
|
||||
- Expected: `[-3,-2]`
|
||||
- Actual: `[-2, -3]`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:46:13 PM
|
||||
|
||||
### quadratic - gpt-4
|
||||
- Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
|
||||
- Expected: `[-3,-2]`
|
||||
- Actual: `[-2, -3]`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 1:03:30 PM
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,201 +1,88 @@
|
||||
import { describe, it, expect } from 'vitest'
|
||||
import { run } from '../../src/index'
|
||||
import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import {
|
||||
models,
|
||||
getDefaultModels,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
formatError,
|
||||
isEmptyResponse
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
runTest,
|
||||
generateTestReport,
|
||||
getReportPaths
|
||||
} from './commons'
|
||||
|
||||
// Optionally override models for this specific test file
|
||||
const models = getDefaultModels()
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './math.json')
|
||||
|
||||
describe('Math Capabilities', () => {
|
||||
describe('Math Operations', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
// Load existing results if any
|
||||
if (exists(TEST_LOG_PATH)) {
|
||||
const data = read(TEST_LOG_PATH, 'json')
|
||||
testResults = Array.isArray(data) ? data : []
|
||||
}
|
||||
const TEST_LOG_PATH = getReportPaths('math', 'json')
|
||||
const TEST_REPORT_PATH = getReportPaths('math', 'md')
|
||||
|
||||
const runMathTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
let startTime = Date.now()
|
||||
let error: TestResult['error'] | undefined
|
||||
|
||||
try {
|
||||
const result = await Promise.race([
|
||||
run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}),
|
||||
new Promise((_, reject) =>
|
||||
setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
|
||||
)
|
||||
]) as string[]
|
||||
it.each(models)('should solve quadratic equation with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.',
|
||||
'-3,-2',
|
||||
'quadratic',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('-3,-2')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
const actual = result?.[0]?.trim()?.toLowerCase() || ''
|
||||
const passed = actual === expected && !isEmptyResponse(result)
|
||||
|
||||
if (isEmptyResponse(result)) {
|
||||
throw new Error('Model returned empty response')
|
||||
}
|
||||
|
||||
expect(actual).toEqual(expected)
|
||||
it.each(models)('should calculate factorial with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Calculate 5! (factorial of 5). Return only the number, no explanation.',
|
||||
'120',
|
||||
'factorial',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('120')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
return {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
duration: Date.now() - startTime,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
|
||||
}
|
||||
} catch (e) {
|
||||
error = formatError(e)
|
||||
throw e
|
||||
} finally {
|
||||
const testResult: TestResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed: false,
|
||||
duration: Date.now() - startTime,
|
||||
error,
|
||||
reason: error?.message || 'Unknown error occurred'
|
||||
}
|
||||
|
||||
testResults.push(testResult)
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
}
|
||||
}
|
||||
|
||||
it.each(models)('should add two numbers with model %s', async (modelName) => {
|
||||
await runMathTest(
|
||||
'add 5 and 3. Return only the number, no explanation.',
|
||||
it.each(models)('should calculate fibonacci sequence with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.',
|
||||
'8',
|
||||
'addition',
|
||||
modelName
|
||||
'fibonacci',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
})
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('8')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
|
||||
await runMathTest(
|
||||
'multiply 8 and 3. Return only the number, no explanation.',
|
||||
'24',
|
||||
'multiplication',
|
||||
modelName
|
||||
it.each(models)('should calculate square root with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Calculate the square root of 16. Return only the number, no explanation.',
|
||||
'4',
|
||||
'square_root',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
})
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('4')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should divide two numbers with model %s', async (modelName) => {
|
||||
await runMathTest(
|
||||
'divide 15 by 3. Return only the number, no explanation.',
|
||||
'5',
|
||||
'division',
|
||||
modelName
|
||||
it.each(models)('should calculate power with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Calculate 2 raised to the power of 3. Return only the number, no explanation.',
|
||||
'8',
|
||||
'power',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
})
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('8')
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
// Group results by test and model
|
||||
const latestResults = new Map<string, Map<string, TestResult>>()
|
||||
|
||||
// Get only the latest result for each test+model combination
|
||||
testResults.forEach(result => {
|
||||
if (!latestResults.has(result.test)) {
|
||||
latestResults.set(result.test, new Map())
|
||||
}
|
||||
const testMap = latestResults.get(result.test)!
|
||||
const existingResult = testMap.get(result.model)
|
||||
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
|
||||
testMap.set(result.model, result)
|
||||
}
|
||||
})
|
||||
|
||||
// Generate markdown report
|
||||
let report = '# Math Test Results\n\n'
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
let hasFailures = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
hasFailures = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
if (result.error) {
|
||||
report += `- Error Type: ${result.error.type}\n`
|
||||
report += `- Error Code: ${result.error.code}\n`
|
||||
report += `- Error Message: ${result.error.message}\n`
|
||||
}
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFailures) {
|
||||
report += '*No failed tests*\n\n'
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
let hasPassed = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
hasPassed = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasPassed) {
|
||||
report += '*No passed tests*\n\n'
|
||||
}
|
||||
|
||||
// Write report to file
|
||||
const reportPath = path.resolve(__dirname, './math-report.md')
|
||||
write(reportPath, report)
|
||||
|
||||
// Verify report was written
|
||||
expect(exists(reportPath) === 'file').toBe(true)
|
||||
generateTestReport(testResults, 'Math Operations Test Results', TEST_REPORT_PATH)
|
||||
expect(exists(TEST_REPORT_PATH) === 'file').toBe(true)
|
||||
})
|
||||
})
|
||||
208
packages/kbot/tests/unit/reports/basic.json
Normal file
208
packages/kbot/tests/unit/reports/basic.json
Normal file
@ -0,0 +1,208 @@
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:56:09.502Z",
|
||||
"passed": true,
|
||||
"duration": 1237
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:56:13.802Z",
|
||||
"passed": true,
|
||||
"duration": 4298
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:56:15.214Z",
|
||||
"passed": true,
|
||||
"duration": 1411
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:56:18.337Z",
|
||||
"passed": true,
|
||||
"duration": 3122
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:56:18.922Z",
|
||||
"passed": true,
|
||||
"duration": 583
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:56:22.539Z",
|
||||
"passed": true,
|
||||
"duration": 3615
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:01:08.904Z",
|
||||
"passed": true,
|
||||
"duration": 1888
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:01:15.210Z",
|
||||
"passed": true,
|
||||
"duration": 6304
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:01:16.502Z",
|
||||
"passed": true,
|
||||
"duration": 1291
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:01:18.728Z",
|
||||
"passed": true,
|
||||
"duration": 2225
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:01:19.938Z",
|
||||
"passed": true,
|
||||
"duration": 1209
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:01:27.791Z",
|
||||
"passed": true,
|
||||
"duration": 7852
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
{
|
||||
"test": "addition",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1888,
|
||||
"duration_secs": 1.888
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 6304,
|
||||
"duration_secs": 6.304
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1291,
|
||||
"duration_secs": 1.291
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 2225,
|
||||
"duration_secs": 2.225
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1209,
|
||||
"duration_secs": 1.209
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 7852,
|
||||
"duration_secs": 7.852
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-02T11:01:27.792Z"
|
||||
}
|
||||
@ -3,16 +3,16 @@
|
||||
## Highscores
|
||||
|
||||
### addition
|
||||
1. anthropic/claude-3.5-sonnet: 1278ms (1.28s)
|
||||
2. qwen/qwq-32b: 6285ms (6.29s)
|
||||
1. anthropic/claude-3.5-sonnet: 1237ms (1.24s)
|
||||
2. qwen/qwq-32b: 4298ms (4.30s)
|
||||
|
||||
### multiplication
|
||||
1. anthropic/claude-3.5-sonnet: 615ms (0.61s)
|
||||
2. qwen/qwq-32b: 9610ms (9.61s)
|
||||
1. anthropic/claude-3.5-sonnet: 1411ms (1.41s)
|
||||
2. qwen/qwq-32b: 3122ms (3.12s)
|
||||
|
||||
### division
|
||||
1. anthropic/claude-3.5-sonnet: 1242ms (1.24s)
|
||||
2. qwen/qwq-32b: 4040ms (4.04s)
|
||||
1. anthropic/claude-3.5-sonnet: 583ms (0.58s)
|
||||
2. qwen/qwq-32b: 3615ms (3.62s)
|
||||
|
||||
## Summary
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
- Passed: 6
|
||||
- Failed: 0
|
||||
- Success Rate: 100.00%
|
||||
- Average Duration: 3845ms (3.85s)
|
||||
- Average Duration: 2378ms (2.38s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
@ -32,41 +32,41 @@
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1278ms (1278.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:09 PM
|
||||
- Duration: 1237ms (1237.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:09 PM
|
||||
|
||||
### addition - qwen/qwq-32b
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 6285ms (6285.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:15 PM
|
||||
- Duration: 4298ms (4298.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:13 PM
|
||||
|
||||
### multiplication - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 615ms (615.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:16 PM
|
||||
- Duration: 1411ms (1411.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:15 PM
|
||||
|
||||
### multiplication - qwen/qwq-32b
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 9610ms (9610.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:25 PM
|
||||
- Duration: 3122ms (3122.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:18 PM
|
||||
|
||||
### division - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 1242ms (1242.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:27 PM
|
||||
- Duration: 583ms (583.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:18 PM
|
||||
|
||||
### division - qwen/qwq-32b
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 4040ms (4040.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:31 PM
|
||||
- Duration: 3615ms (3615.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:22 PM
|
||||
|
||||
72
packages/kbot/tests/unit/reports/basic.md
Normal file
72
packages/kbot/tests/unit/reports/basic.md
Normal file
@ -0,0 +1,72 @@
|
||||
# Basic Operations Test Results
|
||||
|
||||
## Highscores
|
||||
|
||||
### addition
|
||||
1. anthropic/claude-3.5-sonnet: 1888ms (1.89s)
|
||||
2. qwen/qwq-32b: 6304ms (6.30s)
|
||||
|
||||
### multiplication
|
||||
1. anthropic/claude-3.5-sonnet: 1291ms (1.29s)
|
||||
2. qwen/qwq-32b: 2225ms (2.23s)
|
||||
|
||||
### division
|
||||
1. anthropic/claude-3.5-sonnet: 1209ms (1.21s)
|
||||
2. qwen/qwq-32b: 7852ms (7.85s)
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 6
|
||||
- Passed: 6
|
||||
- Failed: 0
|
||||
- Success Rate: 100.00%
|
||||
- Average Duration: 3462ms (3.46s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
*No failed tests*
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### addition - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1888ms (1888.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:08 PM
|
||||
|
||||
### addition - qwen/qwq-32b
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 6304ms (6304.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:15 PM
|
||||
|
||||
### multiplication - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 1291ms (1291.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:16 PM
|
||||
|
||||
### multiplication - qwen/qwq-32b
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 2225ms (2225.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:18 PM
|
||||
|
||||
### division - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 1209ms (1209.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:19 PM
|
||||
|
||||
### division - qwen/qwq-32b
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 7852ms (7852.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:27 PM
|
||||
|
||||
Loading…
Reference in New Issue
Block a user