This commit is contained in:
lovebird 2025-04-02 12:49:47 +02:00
parent 2afc1a8051
commit 856ffe680f
11 changed files with 997 additions and 298 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,9 +1,9 @@
{
"model": "google/gemini-2.0-flash-lite-001",
"model": "qwen/qwq-32b",
"messages": [
{
"role": "user",
"content": "Generate a random number"
"content": "divide 15 by 3. Return only the number, no explanation."
},
{
"role": "user",

View File

@ -1,5 +1,27 @@
# Basic Operations Test Results
## Highscores
### addition
1. anthropic/claude-3.5-sonnet: 1278ms (1.28s)
2. qwen/qwq-32b: 6285ms (6.29s)
### multiplication
1. anthropic/claude-3.5-sonnet: 615ms (0.61s)
2. qwen/qwq-32b: 9610ms (9.61s)
### division
1. anthropic/claude-3.5-sonnet: 1242ms (1.24s)
2. qwen/qwq-32b: 4040ms (4.04s)
## Summary
- Total Tests: 6
- Passed: 6
- Failed: 0
- Success Rate: 100.00%
- Average Duration: 3845ms (3.85s)
## Failed Tests
*No failed tests*
@ -10,48 +32,41 @@
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1551ms
- Timestamp: 4/2/2025, 12:17:39 AM
- Duration: 1278ms (1.28s)
- Timestamp: 4/2/2025, 12:49:09 PM
### addition - qwen/qwq-32b
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 3621ms
- Timestamp: 4/2/2025, 12:17:42 AM
- Duration: 6285ms (6.29s)
- Timestamp: 4/2/2025, 12:49:15 PM
### multiplication - anthropic/claude-3.5-sonnet
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 873ms
- Timestamp: 4/2/2025, 12:17:43 AM
- Duration: 615ms (0.61s)
- Timestamp: 4/2/2025, 12:49:16 PM
### multiplication - qwen/qwq-32b
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 3472ms
- Timestamp: 4/2/2025, 12:17:47 AM
- Duration: 9610ms (9.61s)
- Timestamp: 4/2/2025, 12:49:25 PM
### division - anthropic/claude-3.5-sonnet
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 1183ms
- Timestamp: 4/2/2025, 12:17:48 AM
- Duration: 1242ms (1.24s)
- Timestamp: 4/2/2025, 12:49:27 PM
### division - qwen/qwq-32b
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 4841ms
- Timestamp: 4/2/2025, 12:17:53 AM
## Summary
- Total Tests: 6
- Passed: 6
- Failed: 0
- Success Rate: 100.00%
- Duration: 4040ms (4.04s)
- Timestamp: 4/2/2025, 12:49:31 PM

View File

@ -1,80 +1,541 @@
[
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:39.340Z",
"passed": true,
"duration": 1551
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:42.962Z",
"passed": true,
"duration": 3621
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:43.836Z",
"passed": true,
"duration": 873
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:47.309Z",
"passed": true,
"duration": 3472
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:48.493Z",
"passed": true,
"duration": 1183
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:53.335Z",
"passed": true,
"duration": 4841
}
]
{
"results": [
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:39.340Z",
"passed": true,
"duration": 1551
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:42.962Z",
"passed": true,
"duration": 3621
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:43.836Z",
"passed": true,
"duration": 873
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:47.309Z",
"passed": true,
"duration": 3472
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:48.493Z",
"passed": true,
"duration": 1183
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-01T22:17:53.335Z",
"passed": true,
"duration": 4841
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:38:37.069Z",
"passed": true,
"duration": 1256
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:38:40.167Z",
"passed": false,
"duration": 3096,
"error": {
"message": "Model returned empty response",
"code": "UNKNOWN",
"type": "Error",
"details": {
"stack": "Error: Model returned empty response\n at Module.runTest (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:85:13)\n at processTicksAndRejections (node:internal/process/task_queues:105:5)\n at __vite_ssr_import_0__.it.each.timeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\basic.test.ts:21:20)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:533:5\n at runTest (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1056:11)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runFiles (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1262:5)\n at startTests (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1271:3)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/vitest/dist/chunks/runBaseTests.3qpJUEJM.js:126:11",
"message": "Model returned empty response"
}
},
"reason": "Model returned empty response"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:38:41.086Z",
"passed": true,
"duration": 916
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:38:45.879Z",
"passed": true,
"duration": 4793
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:38:46.900Z",
"passed": true,
"duration": 1020
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:38:50.446Z",
"passed": true,
"duration": 3545
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:39:58.836Z",
"passed": true,
"duration": 1266
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:40:02.777Z",
"passed": true,
"duration": 3939
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:40:03.961Z",
"passed": true,
"duration": 1183
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:40:06.962Z",
"passed": true,
"duration": 3000
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:40:08.115Z",
"passed": true,
"duration": 1152
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:40:12.565Z",
"passed": true,
"duration": 4449
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:41:52.176Z",
"passed": true,
"duration": 1458
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:41:55.869Z",
"passed": true,
"duration": 3691
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:41:57.106Z",
"passed": true,
"duration": 1236
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:41:59.974Z",
"passed": true,
"duration": 2867
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:42:01.272Z",
"passed": true,
"duration": 1297
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:42:04.326Z",
"passed": true,
"duration": 3053
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:44:39.002Z",
"passed": true,
"duration": 1196
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:44:48.668Z",
"passed": true,
"duration": 9664
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:44:49.806Z",
"passed": true,
"duration": 1137
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:44:53.017Z",
"passed": true,
"duration": 3210
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:44:53.814Z",
"passed": true,
"duration": 796
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:44:55.383Z",
"passed": true,
"duration": 1568
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:47:00.955Z",
"passed": true,
"duration": 1297
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:49:09.343Z",
"passed": true,
"duration": 1278
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:49:15.630Z",
"passed": true,
"duration": 6285
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:49:16.246Z",
"passed": true,
"duration": 615
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:49:25.857Z",
"passed": true,
"duration": 9610
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T10:49:27.101Z",
"passed": true,
"duration": 1242
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T10:49:31.142Z",
"passed": true,
"duration": 4040
}
],
"highscores": [
{
"test": "addition",
"rankings": [
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1278,
"duration_secs": 1.278
},
{
"model": "qwen/qwq-32b",
"duration": 6285,
"duration_secs": 6.285
}
]
},
{
"test": "multiplication",
"rankings": [
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 615,
"duration_secs": 0.615
},
{
"model": "qwen/qwq-32b",
"duration": 9610,
"duration_secs": 9.61
}
]
},
{
"test": "division",
"rankings": [
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1242,
"duration_secs": 1.242
},
{
"model": "qwen/qwq-32b",
"duration": 4040,
"duration_secs": 4.04
}
]
}
],
"lastUpdated": "2025-04-02T10:49:31.142Z"
}

View File

@ -1,8 +1,5 @@
import { describe, it, expect } from 'vitest'
import { run } from '../../src/index'
import * as path from 'node:path'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import {
models,
@ -11,200 +8,54 @@ import {
TEST_PREFERENCES_PATH,
TEST_TIMEOUT,
TestResult,
formatError,
isEmptyResponse
runTest,
generateTestReport
} from './commons'
const TEST_LOG_PATH = path.resolve(__dirname, './basic.json')
describe('Basic Operations', () => {
let testResults: TestResult[] = []
const runBasicTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
let model = 'unknown'
let router = 'unknown'
let startTime = Date.now()
let error: TestResult['error'] | undefined
let testResult: TestResult | undefined
try {
const result = await Promise.race([
run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
)
]) as string[]
if (isEmptyResponse(result)) {
throw new Error('Model returned empty response')
}
const actual = result?.[0]?.trim()?.toLowerCase() || ''
const passed = actual === expected
expect(actual).toEqual(expected)
testResult = {
test: testName,
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
duration: Date.now() - startTime,
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
}
} catch (e) {
error = formatError(e)
testResult = {
test: testName,
prompt,
result: [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed: false,
duration: Date.now() - startTime,
error,
reason: error?.message || 'Unknown error occurred'
}
throw e
} finally {
if (testResult) {
testResults.push(testResult)
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
}
}
}
it.each(models)('should add two numbers with model %s', async (modelName) => {
await runBasicTest(
const result = await runTest(
'add 5 and 3. Return only the number, no explanation.',
'8',
'addition',
modelName
modelName,
TEST_LOG_PATH
)
testResults.push(result)
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('8')
}, { timeout: 10000 })
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
await runBasicTest(
const result = await runTest(
'multiply 8 and 3. Return only the number, no explanation.',
'24',
'multiplication',
modelName
modelName,
TEST_LOG_PATH
)
testResults.push(result)
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('24')
}, { timeout: 10000 })
it.each(models)('should divide two numbers with model %s', async (modelName) => {
await runBasicTest(
const result = await runTest(
'divide 15 by 3. Return only the number, no explanation.',
'5',
'division',
modelName
modelName,
TEST_LOG_PATH
)
testResults.push(result)
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('5')
}, { timeout: 10000 })
it('should generate markdown report', () => {
// Group results by test and model
const latestResults = new Map<string, Map<string, TestResult>>()
// Get only the latest result for each test+model combination
testResults.forEach(result => {
if (!latestResults.has(result.test)) {
latestResults.set(result.test, new Map())
}
const testMap = latestResults.get(result.test)!
const existingResult = testMap.get(result.model)
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
testMap.set(result.model, result)
}
})
// Generate markdown report
let report = '# Basic Operations Test Results\n\n'
// First list failed tests
report += '## Failed Tests\n\n'
let hasFailures = false
for (const [testName, modelResults] of latestResults) {
for (const [model, result] of modelResults) {
if (!result.passed) {
hasFailures = true
report += `### ${testName} - ${model}\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms\n`
if (result.error) {
report += `- Error Type: ${result.error.type}\n`
report += `- Error Code: ${result.error.code}\n`
report += `- Error Message: ${result.error.message}\n`
if (result.error.details?.message) {
report += `- Error Details: ${result.error.details.message}\n`
}
}
report += `- Reason: ${result.reason}\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}
}
if (!hasFailures) {
report += '*No failed tests*\n\n'
}
// Then list passed tests
report += '## Passed Tests\n\n'
let hasPassed = false
for (const [testName, modelResults] of latestResults) {
for (const [model, result] of modelResults) {
if (result.passed) {
hasPassed = true
report += `### ${testName} - ${model}\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}
}
if (!hasPassed) {
report += '*No passed tests*\n\n'
}
// Add summary section
report += '## Summary\n\n'
const totalTests = testResults.length
const passedTests = testResults.filter(r => r.passed).length
const failedTests = totalTests - passedTests
report += `- Total Tests: ${totalTests}\n`
report += `- Passed: ${passedTests}\n`
report += `- Failed: ${failedTests}\n`
report += `- Success Rate: ${((passedTests / totalTests) * 100).toFixed(2)}%\n\n`
// Write report to file
const reportPath = path.resolve(__dirname, './basic-report.md')
write(reportPath, report)
// Verify report was written
generateTestReport(testResults, 'Basic Operations Test Results', reportPath)
expect(exists(reportPath) === 'file').toBe(true)
})
})

View File

@ -1,5 +1,9 @@
import * as path from 'node:path'
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL, E_OPENROUTER_MODEL } from '../../src/index'
import { run } from '../../src/index'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
export const models = [
//E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
@ -31,6 +35,15 @@ export interface TestResult {
duration?: number
}
/**
 * Speed leaderboard for a single test, as produced by `generateHighscores`.
 */
export interface TestHighscore {
  /** Name of the test the rankings belong to. */
  test: string
  /** Ranked models, fastest first. */
  rankings: Array<{
    /** Model identifier the result was recorded for. */
    model: string
    /** Duration in milliseconds. */
    duration: number
    /** The same duration expressed in seconds (`duration / 1000`). */
    duration_secs: number
  }>
}
export const formatError = (error: any): TestResult['error'] => {
return {
message: error?.message || 'Unknown error',
@ -43,3 +56,251 @@ export const formatError = (error: any): TestResult['error'] => {
/**
 * Tells whether a model response carries no usable text.
 *
 * A response counts as empty when it is null/undefined, has no entries, or
 * every entry is falsy or consists only of whitespace.
 *
 * @param result Raw response entries returned by the model.
 * @returns `true` when nothing in `result` contains non-whitespace text.
 */
export const isEmptyResponse = (result: string[] | null | undefined): boolean => {
  if (!result) {
    return true
  }
  // `some` is false for an empty array, so `[]` is reported as empty as well.
  const hasContent = result.some(entry => Boolean(entry) && entry.trim() !== '')
  return !hasContent
}
/**
 * Builds the speed leaderboard for every test.
 *
 * For each test in `latestResults` the per-model results are ordered by
 * ascending duration (a missing duration counts as 0) and the two fastest are
 * kept. Pass/fail status is not taken into account when ranking.
 *
 * @param latestResults Map of test name -> (model name -> result for that model).
 * @returns One entry per test that has at least one result, in map iteration order.
 */
export const generateHighscores = (latestResults: Map<string, Map<string, TestResult>>): TestHighscore[] => {
  const leaderboard: TestHighscore[] = []

  latestResults.forEach((byModel, testName) => {
    // Resolve the effective duration once, then rank on it.
    const fastest = Array.from(byModel, ([model, outcome]) => ({
      model,
      duration: outcome.duration || 0
    }))
      .sort((left, right) => left.duration - right.duration)
      .slice(0, 2) // top 2 only

    if (fastest.length === 0) {
      return
    }

    leaderboard.push({
      test: testName,
      rankings: fastest.map(({ model, duration }) => ({
        model,
        duration,
        duration_secs: duration / 1000
      }))
    })
  })

  return leaderboard
}
/**
 * Runs one prompt against one model and records the outcome in a JSON log.
 *
 * The call to `run` is raced against a `TEST_TIMEOUT` timer. The first entry of
 * the response is trimmed and lower-cased, then compared with `expected`. An
 * empty response is recorded as a failed result without throwing. Any error
 * (including the timeout) is recorded and then re-thrown to the caller.
 *
 * Whatever the outcome, the result is appended to the log at `logPath`, and the
 * log's `highscores` and `lastUpdated` fields are regenerated.
 *
 * @param prompt Prompt sent to the model.
 * @param expected Expected answer, compared against the normalised first response entry.
 * @param testName Logical test name used to group results.
 * @param modelName Model identifier passed to `run`.
 * @param logPath Path of the JSON log file to read and rewrite.
 * @returns The recorded result (also for failed comparisons and empty responses).
 * @throws Re-throws whatever `run` or the timeout rejected with.
 */
export const runTest = async (
  prompt: string,
  expected: string,
  testName: string,
  modelName: string,
  logPath: string
): Promise<TestResult> => {
  let model = 'unknown'
  let router = 'unknown'
  const startTime = Date.now()
  let testResult: TestResult | undefined
  // Deliberately left without an initializer: it is assigned inside the promise executor below.
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined

  // Assembles a result record; `model` and `router` are read at call time so
  // values captured by `onRun` are reflected.
  const buildResult = (
    result: string[],
    passed: boolean,
    extra: Pick<TestResult, 'error' | 'reason'>
  ): TestResult => ({
    test: testName,
    prompt,
    result,
    expected,
    model,
    router,
    timestamp: new Date().toISOString(),
    passed,
    duration: Date.now() - startTime,
    ...extra
  })

  try {
    const result = await Promise.race([
      run({
        prompt,
        mode: 'completion',
        model: modelName,
        path: TEST_BASE_PATH,
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        onRun: async (options) => {
          model = options.model || 'unknown'
          router = options.router || 'unknown'
          return options
        }
      }),
      new Promise((_, reject) => {
        timeoutHandle = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
      })
    ]) as string[]

    if (isEmptyResponse(result)) {
      testResult = buildResult([], false, { reason: 'Model returned empty response' })
    } else {
      const actual = result?.[0]?.trim()?.toLowerCase() || ''
      const passed = actual === expected
      testResult = buildResult(result || [], passed, {
        reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
      })
    }
  } catch (e) {
    const error = formatError(e)
    testResult = buildResult([], false, {
      error,
      reason: error?.message || 'Unknown error occurred'
    })
    throw e
  } finally {
    // Cancel the race timer so it does not stay pending after the call has settled.
    clearTimeout(timeoutHandle)

    if (testResult) {
      const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath)) : { results: [], highscores: [] }
      // Older logs stored a bare array of results; keep that history instead of dropping it.
      const previousResults: TestResult[] = Array.isArray(existingData)
        ? existingData
        : (existingData?.results || [])
      const updatedResults = [...previousResults, testResult]

      // Group results by test and model, keeping only the most recent per combination
      const latestResults = new Map<string, Map<string, TestResult>>()
      updatedResults.forEach(result => {
        if (!latestResults.has(result.test)) {
          latestResults.set(result.test, new Map())
        }
        const testMap = latestResults.get(result.test)!
        const existingResult = testMap.get(result.model)
        if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
          testMap.set(result.model, result)
        }
      })

      // Generate highscores
      const highscores = generateHighscores(latestResults)

      // Write both results and highscores
      write(logPath, JSON.stringify({
        results: updatedResults,
        highscores,
        lastUpdated: new Date().toISOString()
      }, null, 2))
    }
  }

  return testResult
}
/**
 * Renders a markdown report for a set of test results and writes it to `reportPath`.
 *
 * Sections, in order: Highscores (two fastest models per test), Summary,
 * Failed Tests, Passed Tests. The highscore and detail sections use only the
 * most recent result per test+model combination. The summary counts every
 * entry of `testResults`.
 *
 * @param testResults Results to report on; may be empty.
 * @param reportTitle Text of the top-level markdown heading.
 * @param reportPath Destination file, written via `write`.
 */
export const generateTestReport = (
  testResults: TestResult[],
  reportTitle: string,
  reportPath: string
): void => {
  // Formats a millisecond duration as e.g. `1278ms (1.28s)`.
  const formatDuration = (ms: number): string => `${ms}ms (${(ms / 1000).toFixed(2)}s)`

  // Renders the detail entry for one result; error and reason lines are only
  // emitted for results that did not pass.
  const renderResult = (testName: string, model: string, result: TestResult): string => {
    let entry = `### ${testName} - ${model}\n`
    entry += `- Prompt: \`${result.prompt}\`\n`
    entry += `- Expected: \`${result.expected}\`\n`
    entry += `- Actual: \`${result.result[0] || ''}\`\n`
    // The previous `(result.duration || 0 / 1000)` was parsed as
    // `result.duration || (0 / 1000)` and printed milliseconds as seconds.
    entry += `- Duration: ${formatDuration(result.duration || 0)}\n`
    if (!result.passed) {
      if (result.error) {
        entry += `- Error Type: ${result.error.type}\n`
        entry += `- Error Code: ${result.error.code}\n`
        entry += `- Error Message: ${result.error.message}\n`
        if (result.error.details?.message) {
          entry += `- Error Details: ${result.error.details.message}\n`
        }
      }
      entry += `- Reason: ${result.reason}\n`
    }
    entry += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
    return entry
  }

  // Get only the latest result for each test+model combination
  const latestResults = new Map<string, Map<string, TestResult>>()
  testResults.forEach(result => {
    if (!latestResults.has(result.test)) {
      latestResults.set(result.test, new Map())
    }
    const testMap = latestResults.get(result.test)!
    const existingResult = testMap.get(result.model)
    if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
      testMap.set(result.model, result)
    }
  })

  let report = `# ${reportTitle}\n\n`

  // Highscore section: two fastest models per test
  report += '## Highscores\n\n'
  for (const [testName, modelResults] of latestResults) {
    report += `### ${testName}\n`
    const sortedResults = Array.from(modelResults.entries())
      .map(([model, result]) => ({ model, duration: result.duration || 0 }))
      .sort((a, b) => a.duration - b.duration)
      .slice(0, 2) // Get top 2
    if (sortedResults.length > 0) {
      sortedResults.forEach(({ model, duration }, index) => {
        report += `${index + 1}. ${model}: ${formatDuration(duration)}\n`
      })
    } else {
      report += '*No results available*\n'
    }
    report += '\n'
  }

  // Summary section; guard the divisions so an empty run reports 0 rather than NaN
  report += '## Summary\n\n'
  const totalTests = testResults.length
  const passedTests = testResults.filter(r => r.passed).length
  const failedTests = totalTests - passedTests
  const totalDuration = testResults.reduce((sum, r) => sum + (r.duration || 0), 0)
  const avgDuration = totalTests > 0 ? totalDuration / totalTests : 0
  const successRate = totalTests > 0 ? (passedTests / totalTests) * 100 : 0
  report += `- Total Tests: ${totalTests}\n`
  report += `- Passed: ${passedTests}\n`
  report += `- Failed: ${failedTests}\n`
  report += `- Success Rate: ${successRate.toFixed(2)}%\n`
  report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n`

  // Failed tests first
  report += '## Failed Tests\n\n'
  let hasFailures = false
  for (const [testName, modelResults] of latestResults) {
    for (const [model, result] of modelResults) {
      if (!result.passed) {
        hasFailures = true
        report += renderResult(testName, model, result)
      }
    }
  }
  if (!hasFailures) {
    report += '*No failed tests*\n\n'
  }

  // Then passed tests
  report += '## Passed Tests\n\n'
  let hasPassed = false
  for (const [testName, modelResults] of latestResults) {
    for (const [model, result] of modelResults) {
      if (result.passed) {
        hasPassed = true
        report += renderResult(testName, model, result)
      }
    }
  }
  if (!hasPassed) {
    report += '*No passed tests*\n\n'
  }

  // Write report to file
  write(reportPath, report)
}

View File

@ -6,21 +6,14 @@
## Passed Tests
### json-schema-file-format - google/gemini-2.0-flash-lite-001
### json-schema-file-format - mistralai/mistral-tiny
- Prompt: `Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.`
- Expected: `{"name":"John Doe","age":30,"tags":["developer","javascript"]}`
- Actual: ````json
{
"name": "John Doe",
"age": 30,
"tags": ["developer", "javascript"]
}
```
`
- Duration: 1122ms
- Timestamp: 4/2/2025, 9:03:33 AM
- Actual: `{"name": "John Doe", "age": 30, "tags": ["developer", "javascript"]}`
- Duration: 982ms
- Timestamp: 4/2/2025, 10:26:33 AM
### json-schema-object-format - google/gemini-2.0-flash-lite-001
### json-schema-object-format - mistralai/mistral-tiny
- Prompt: `Create a user profile with the following details:
- Name: Jane Smith
- Age: 25
@ -39,8 +32,7 @@
}
Return only the JSON object, no explanation.`
- Expected: `{"name":"Jane Smith","age":25,"email":"jane.smith@company.com","tags":["developer","designer"],"address":{"street":"123 Main St","city":"New York","country":"US","postal_code":"10001"},"preferences":{"theme":"light","notifications":"enabled","language":"English"}}`
- Actual: ````json
{
- Actual: `{
"name": "Jane Smith",
"age": 25,
"email": "jane.smith@company.com",
@ -51,16 +43,14 @@
"country": "US",
"postal_code": "10001"
},
"preferences": {
"Preferences": {
"theme": "light",
"notifications": "enabled",
"language": "English"
}
}
```
`
- Duration: 1289ms
- Timestamp: 4/2/2025, 9:03:34 AM
}`
- Duration: 1673ms
- Timestamp: 4/2/2025, 10:26:35 AM
## Summary

View File

@ -3,26 +3,26 @@
"test": "json-schema-file-format",
"prompt": "Create a user profile with name John Doe, age 30, and tags [\"developer\", \"javascript\"]. Return only the JSON object, no explanation.",
"result": [
"```json\n{\n \"name\": \"John Doe\",\n \"age\": 30,\n \"tags\": [\"developer\", \"javascript\"]\n}\n```\n"
"{\"name\": \"John Doe\", \"age\": 30, \"tags\": [\"developer\", \"javascript\"]}"
],
"expected": "{\"name\":\"John Doe\",\"age\":30,\"tags\":[\"developer\",\"javascript\"]}",
"model": "google/gemini-2.0-flash-lite-001",
"model": "mistralai/mistral-tiny",
"router": "openrouter",
"timestamp": "2025-04-02T07:03:33.038Z",
"timestamp": "2025-04-02T08:26:33.513Z",
"passed": true,
"duration": 1122
"duration": 982
},
{
"test": "json-schema-object-format",
"prompt": "Create a user profile with the following details:\n - Name: Jane Smith\n - Age: 25\n - Email: jane.smith@company.com\n - Tags: [\"developer\", \"designer\"]\n - Address: {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n }\n - Preferences: {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n Return only the JSON object, no explanation.",
"result": [
"```json\n{\n \"name\": \"Jane Smith\",\n \"age\": 25,\n \"email\": \"jane.smith@company.com\",\n \"tags\": [\"developer\", \"designer\"],\n \"address\": {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n },\n \"preferences\": {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n}\n```\n"
"{\n \"name\": \"Jane Smith\",\n \"age\": 25,\n \"email\": \"jane.smith@company.com\",\n \"tags\": [\"developer\", \"designer\"],\n \"address\": {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n },\n \"Preferences\": {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n}"
],
"expected": "{\"name\":\"Jane Smith\",\"age\":25,\"email\":\"jane.smith@company.com\",\"tags\":[\"developer\",\"designer\"],\"address\":{\"street\":\"123 Main St\",\"city\":\"New York\",\"country\":\"US\",\"postal_code\":\"10001\"},\"preferences\":{\"theme\":\"light\",\"notifications\":\"enabled\",\"language\":\"English\"}}",
"model": "google/gemini-2.0-flash-lite-001",
"model": "mistralai/mistral-tiny",
"router": "openrouter",
"timestamp": "2025-04-02T07:03:34.328Z",
"timestamp": "2025-04-02T08:26:35.187Z",
"passed": true,
"duration": 1289
"duration": 1673
}
]

View File

@ -17,7 +17,8 @@ import {
const TEST_LOG_PATH = path.resolve(__dirname, './format.json')
const TEST_SCHEMA_PATH = path.resolve(__dirname, './test-schema.json')
const TEST_MODEL = 'google/gemini-2.0-flash-lite-001'
const TEST_MODEL_FAST = 'mistralai/codestral-2501'
const TEST_MODEL = 'mistralai/mistral-tiny'
const TEST_ROUTER = 'openrouter'
// Sample JSON Schema for testing

View File

@ -40,6 +40,22 @@
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 11:53:55 PM
### german - anthropic/claude-3.5-sonnet
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
- Expected: `hallo`
- Actual: ``
- Duration: 1753ms
- Reason: Unknown error occurred
- Timestamp: 4/2/2025, 10:29:45 AM
### german - qwen/qwq-32b
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
- Expected: `hallo`
- Actual: ``
- Duration: 13757ms
- Reason: Unknown error occurred
- Timestamp: 4/2/2025, 10:29:59 AM
### spanish - deepseek/deepseek-chat:free
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
- Expected: `sí`
@ -78,6 +94,22 @@
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 11:53:56 PM
### spanish - anthropic/claude-3.5-sonnet
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
- Expected: `sí`
- Actual: ``
- Duration: 1242ms
- Reason: Unknown error occurred
- Timestamp: 4/2/2025, 10:29:51 AM
### spanish - qwen/qwq-32b
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
- Expected: `sí`
- Actual: ``
- Duration: 5925ms
- Reason: Unknown error occurred
- Timestamp: 4/2/2025, 10:29:57 AM
### french - deepseek/deepseek-chat:free
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
- Expected: `non`
@ -116,6 +148,22 @@
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 11:53:57 PM
### french - anthropic/claude-3.5-sonnet
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
- Expected: `non`
- Actual: ``
- Duration: 2656ms
- Reason: Unknown error occurred
- Timestamp: 4/2/2025, 10:29:59 AM
### french - qwen/qwq-32b
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
- Expected: `non`
- Actual: ``
- Duration: 4063ms
- Reason: Unknown error occurred
- Timestamp: 4/2/2025, 10:30:03 AM
## Passed Tests
### german_translation - deepseek/deepseek-chat:free

View File

@ -843,5 +843,77 @@
"passed": false,
"duration": 968,
"reason": "Unknown error occurred"
},
{
"test": "german",
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
"result": [],
"expected": "hallo",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T08:29:45.316Z",
"passed": false,
"duration": 1753,
"reason": "Unknown error occurred"
},
{
"test": "spanish",
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
"result": [],
"expected": "sí",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T08:29:51.577Z",
"passed": false,
"duration": 1242,
"reason": "Unknown error occurred"
},
{
"test": "spanish",
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
"result": [],
"expected": "sí",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T08:29:57.503Z",
"passed": false,
"duration": 5925,
"reason": "Unknown error occurred"
},
{
"test": "german",
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
"result": [],
"expected": "hallo",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T08:29:59.075Z",
"passed": false,
"duration": 13757,
"reason": "Unknown error occurred"
},
{
"test": "french",
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
"result": [],
"expected": "non",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T08:29:59.249Z",
"passed": false,
"duration": 2656,
"reason": "Unknown error occurred"
},
{
"test": "french",
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
"result": [],
"expected": "non",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T08:30:03.313Z",
"passed": false,
"duration": 4063,
"reason": "Unknown error occurred"
}
]