From 856ffe680f564ee88fdc157a89fed605e9d4f273 Mon Sep 17 00:00:00 2001 From: babayaga Date: Wed, 2 Apr 2025 12:49:47 +0200 Subject: [PATCH] latest --- packages/kbot/dist-in/zod_schema.js | 2 +- packages/kbot/logs/params.json | 4 +- packages/kbot/tests/unit/basic-report.md | 53 +- packages/kbot/tests/unit/basic.json | 621 +++++++++++++++++--- packages/kbot/tests/unit/basic.test.ts | 185 +----- packages/kbot/tests/unit/commons.ts | 261 ++++++++ packages/kbot/tests/unit/format-report.md | 30 +- packages/kbot/tests/unit/format.json | 16 +- packages/kbot/tests/unit/format.test.ts | 3 +- packages/kbot/tests/unit/language-report.md | 48 ++ packages/kbot/tests/unit/language.json | 72 +++ 11 files changed, 997 insertions(+), 298 deletions(-) diff --git a/packages/kbot/dist-in/zod_schema.js b/packages/kbot/dist-in/zod_schema.js index 59ca5ff3..ecb62ca3 100644 --- a/packages/kbot/dist-in/zod_schema.js +++ b/packages/kbot/dist-in/zod_schema.js @@ -212,4 +212,4 @@ export const schemas = () => { write([OptionsSchema()], 'schema.json', 'kbot', {}); writeFS('schema_ui.json', schemaMap.getUISchema()); }; -//# sourceMappingURL=data:application/json;base64, \ No newline at end of file +//# sourceMappingURL=data:application/json;base64, \ No newline at end of file diff --git a/packages/kbot/logs/params.json b/packages/kbot/logs/params.json index 15527994..605ca4cd 100644 --- a/packages/kbot/logs/params.json +++ b/packages/kbot/logs/params.json @@ -1,9 +1,9 @@ { - "model": "google/gemini-2.0-flash-lite-001", + "model": "qwen/qwq-32b", "messages": [ { "role": "user", - "content": "Generate a random number" + "content": "divide 15 by 3. Return only the number, no explanation." 
}, { "role": "user", diff --git a/packages/kbot/tests/unit/basic-report.md b/packages/kbot/tests/unit/basic-report.md index d94a4807..9903ba60 100644 --- a/packages/kbot/tests/unit/basic-report.md +++ b/packages/kbot/tests/unit/basic-report.md @@ -1,5 +1,27 @@ # Basic Operations Test Results +## Highscores + +### addition +1. anthropic/claude-3.5-sonnet: 1278ms (1.28s) +2. qwen/qwq-32b: 6285ms (6.29s) + +### multiplication +1. anthropic/claude-3.5-sonnet: 615ms (0.61s) +2. qwen/qwq-32b: 9610ms (9.61s) + +### division +1. anthropic/claude-3.5-sonnet: 1242ms (1.24s) +2. qwen/qwq-32b: 4040ms (4.04s) + +## Summary + +- Total Tests: 6 +- Passed: 6 +- Failed: 0 +- Success Rate: 100.00% +- Average Duration: 3845ms (3.85s) + ## Failed Tests *No failed tests* @@ -10,48 +32,41 @@ - Prompt: `add 5 and 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 1551ms -- Timestamp: 4/2/2025, 12:17:39 AM +- Duration: 1278ms (1278.00s) +- Timestamp: 4/2/2025, 12:49:09 PM ### addition - qwen/qwq-32b - Prompt: `add 5 and 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 3621ms -- Timestamp: 4/2/2025, 12:17:42 AM +- Duration: 6285ms (6285.00s) +- Timestamp: 4/2/2025, 12:49:15 PM ### multiplication - anthropic/claude-3.5-sonnet - Prompt: `multiply 8 and 3. Return only the number, no explanation.` - Expected: `24` - Actual: `24` -- Duration: 873ms -- Timestamp: 4/2/2025, 12:17:43 AM +- Duration: 615ms (615.00s) +- Timestamp: 4/2/2025, 12:49:16 PM ### multiplication - qwen/qwq-32b - Prompt: `multiply 8 and 3. Return only the number, no explanation.` - Expected: `24` - Actual: `24` -- Duration: 3472ms -- Timestamp: 4/2/2025, 12:17:47 AM +- Duration: 9610ms (9610.00s) +- Timestamp: 4/2/2025, 12:49:25 PM ### division - anthropic/claude-3.5-sonnet - Prompt: `divide 15 by 3. 
Return only the number, no explanation.` - Expected: `5` - Actual: `5` -- Duration: 1183ms -- Timestamp: 4/2/2025, 12:17:48 AM +- Duration: 1242ms (1242.00s) +- Timestamp: 4/2/2025, 12:49:27 PM ### division - qwen/qwq-32b - Prompt: `divide 15 by 3. Return only the number, no explanation.` - Expected: `5` - Actual: `5` -- Duration: 4841ms -- Timestamp: 4/2/2025, 12:17:53 AM - -## Summary - -- Total Tests: 6 -- Passed: 6 -- Failed: 0 -- Success Rate: 100.00% +- Duration: 4040ms (4040.00s) +- Timestamp: 4/2/2025, 12:49:31 PM diff --git a/packages/kbot/tests/unit/basic.json b/packages/kbot/tests/unit/basic.json index cd6be239..5e5bd5ea 100644 --- a/packages/kbot/tests/unit/basic.json +++ b/packages/kbot/tests/unit/basic.json @@ -1,80 +1,541 @@ -[ - { - "test": "addition", - "prompt": "add 5 and 3. Return only the number, no explanation.", - "result": [ - "8" - ], - "expected": "8", - "model": "anthropic/claude-3.5-sonnet", - "router": "openrouter", - "timestamp": "2025-04-01T22:17:39.340Z", - "passed": true, - "duration": 1551 - }, - { - "test": "addition", - "prompt": "add 5 and 3. Return only the number, no explanation.", - "result": [ - "8" - ], - "expected": "8", - "model": "qwen/qwq-32b", - "router": "openrouter", - "timestamp": "2025-04-01T22:17:42.962Z", - "passed": true, - "duration": 3621 - }, - { - "test": "multiplication", - "prompt": "multiply 8 and 3. Return only the number, no explanation.", - "result": [ - "24" - ], - "expected": "24", - "model": "anthropic/claude-3.5-sonnet", - "router": "openrouter", - "timestamp": "2025-04-01T22:17:43.836Z", - "passed": true, - "duration": 873 - }, - { - "test": "multiplication", - "prompt": "multiply 8 and 3. Return only the number, no explanation.", - "result": [ - "24" - ], - "expected": "24", - "model": "qwen/qwq-32b", - "router": "openrouter", - "timestamp": "2025-04-01T22:17:47.309Z", - "passed": true, - "duration": 3472 - }, - { - "test": "division", - "prompt": "divide 15 by 3. 
Return only the number, no explanation.", - "result": [ - "5" - ], - "expected": "5", - "model": "anthropic/claude-3.5-sonnet", - "router": "openrouter", - "timestamp": "2025-04-01T22:17:48.493Z", - "passed": true, - "duration": 1183 - }, - { - "test": "division", - "prompt": "divide 15 by 3. Return only the number, no explanation.", - "result": [ - "5" - ], - "expected": "5", - "model": "qwen/qwq-32b", - "router": "openrouter", - "timestamp": "2025-04-01T22:17:53.335Z", - "passed": true, - "duration": 4841 - } -] \ No newline at end of file +{ + "results": [ + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-01T22:17:39.340Z", + "passed": true, + "duration": 1551 + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-01T22:17:42.962Z", + "passed": true, + "duration": 3621 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-01T22:17:43.836Z", + "passed": true, + "duration": 873 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-01T22:17:47.309Z", + "passed": true, + "duration": 3472 + }, + { + "test": "division", + "prompt": "divide 15 by 3. 
Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-01T22:17:48.493Z", + "passed": true, + "duration": 1183 + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-01T22:17:53.335Z", + "passed": true, + "duration": 4841 + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:38:37.069Z", + "passed": true, + "duration": 1256 + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:38:40.167Z", + "passed": false, + "duration": 3096, + "error": { + "message": "Model returned empty response", + "code": "UNKNOWN", + "type": "Error", + "details": { + "stack": "Error: Model returned empty response\n at Module.runTest (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:85:13)\n at processTicksAndRejections (node:internal/process/task_queues:105:5)\n at __vite_ssr_import_0__.it.each.timeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\basic.test.ts:21:20)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:533:5\n at runTest (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1056:11)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runSuite 
(file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runFiles (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1262:5)\n at startTests (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1271:3)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/vitest/dist/chunks/runBaseTests.3qpJUEJM.js:126:11", + "message": "Model returned empty response" + } + }, + "reason": "Model returned empty response" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:38:41.086Z", + "passed": true, + "duration": 916 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:38:45.879Z", + "passed": true, + "duration": 4793 + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:38:46.900Z", + "passed": true, + "duration": 1020 + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:38:50.446Z", + "passed": true, + "duration": 3545 + }, + { + "test": "addition", + "prompt": "add 5 and 3. 
Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:39:58.836Z", + "passed": true, + "duration": 1266 + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:40:02.777Z", + "passed": true, + "duration": 3939 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:40:03.961Z", + "passed": true, + "duration": 1183 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:40:06.962Z", + "passed": true, + "duration": 3000 + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:40:08.115Z", + "passed": true, + "duration": 1152 + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:40:12.565Z", + "passed": true, + "duration": 4449 + }, + { + "test": "addition", + "prompt": "add 5 and 3. 
Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:41:52.176Z", + "passed": true, + "duration": 1458 + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:41:55.869Z", + "passed": true, + "duration": 3691 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:41:57.106Z", + "passed": true, + "duration": 1236 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:41:59.974Z", + "passed": true, + "duration": 2867 + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:42:01.272Z", + "passed": true, + "duration": 1297 + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:42:04.326Z", + "passed": true, + "duration": 3053 + }, + { + "test": "addition", + "prompt": "add 5 and 3. 
Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:44:39.002Z", + "passed": true, + "duration": 1196 + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:44:48.668Z", + "passed": true, + "duration": 9664 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:44:49.806Z", + "passed": true, + "duration": 1137 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:44:53.017Z", + "passed": true, + "duration": 3210 + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:44:53.814Z", + "passed": true, + "duration": 796 + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:44:55.383Z", + "passed": true, + "duration": 1568 + }, + { + "test": "addition", + "prompt": "add 5 and 3. 
Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:47:00.955Z", + "passed": true, + "duration": 1297 + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:49:09.343Z", + "passed": true, + "duration": 1278 + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:49:15.630Z", + "passed": true, + "duration": 6285 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:49:16.246Z", + "passed": true, + "duration": 615 + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:49:25.857Z", + "passed": true, + "duration": 9610 + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T10:49:27.101Z", + "passed": true, + "duration": 1242 + }, + { + "test": "division", + "prompt": "divide 15 by 3. 
Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T10:49:31.142Z", + "passed": true, + "duration": 4040 + } + ], + "highscores": [ + { + "test": "addition", + "rankings": [ + { + "model": "anthropic/claude-3.5-sonnet", + "duration": 1278, + "duration_secs": 1.278 + }, + { + "model": "qwen/qwq-32b", + "duration": 6285, + "duration_secs": 6.285 + } + ] + }, + { + "test": "multiplication", + "rankings": [ + { + "model": "anthropic/claude-3.5-sonnet", + "duration": 615, + "duration_secs": 0.615 + }, + { + "model": "qwen/qwq-32b", + "duration": 9610, + "duration_secs": 9.61 + } + ] + }, + { + "test": "division", + "rankings": [ + { + "model": "anthropic/claude-3.5-sonnet", + "duration": 1242, + "duration_secs": 1.242 + }, + { + "model": "qwen/qwq-32b", + "duration": 4040, + "duration_secs": 4.04 + } + ] + } + ], + "lastUpdated": "2025-04-02T10:49:31.142Z" +} \ No newline at end of file diff --git a/packages/kbot/tests/unit/basic.test.ts b/packages/kbot/tests/unit/basic.test.ts index dd1b3eb7..5022cc84 100644 --- a/packages/kbot/tests/unit/basic.test.ts +++ b/packages/kbot/tests/unit/basic.test.ts @@ -1,8 +1,5 @@ import { describe, it, expect } from 'vitest' -import { run } from '../../src/index' import * as path from 'node:path' -import { sync as write } from "@polymech/fs/write" -import { sync as read } from "@polymech/fs/read" import { sync as exists } from "@polymech/fs/exists" import { models, @@ -11,200 +8,54 @@ import { TEST_PREFERENCES_PATH, TEST_TIMEOUT, TestResult, - formatError, - isEmptyResponse + runTest, + generateTestReport } from './commons' const TEST_LOG_PATH = path.resolve(__dirname, './basic.json') describe('Basic Operations', () => { let testResults: TestResult[] = [] - - - const runBasicTest = async (prompt: string, expected: string, testName: string, modelName: string) => { - let model = 'unknown' - let router = 'unknown' - let 
startTime = Date.now() - let error: TestResult['error'] | undefined - let testResult: TestResult | undefined - - try { - const result = await Promise.race([ - run({ - prompt, - mode: 'completion', - model: modelName, - path: TEST_BASE_PATH, - logs: TEST_LOGS_PATH, - preferences: TEST_PREFERENCES_PATH, - onRun: async (options) => { - model = options.model || 'unknown' - router = options.router || 'unknown' - return options - } - }), - new Promise((_, reject) => - setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT) - ) - ]) as string[] - - if (isEmptyResponse(result)) { - throw new Error('Model returned empty response') - } - - const actual = result?.[0]?.trim()?.toLowerCase() || '' - const passed = actual === expected - - expect(actual).toEqual(expected) - - testResult = { - test: testName, - prompt, - result: result || [], - expected, - model, - router, - timestamp: new Date().toISOString(), - passed, - duration: Date.now() - startTime, - reason: passed ? undefined : `Expected ${expected}, but got ${actual}`, - } - } catch (e) { - error = formatError(e) - testResult = { - test: testName, - prompt, - result: [], - expected, - model, - router, - timestamp: new Date().toISOString(), - passed: false, - duration: Date.now() - startTime, - error, - reason: error?.message || 'Unknown error occurred' - } - throw e - } finally { - if (testResult) { - testResults.push(testResult) - write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) - } - } - } it.each(models)('should add two numbers with model %s', async (modelName) => { - await runBasicTest( + const result = await runTest( 'add 5 and 3. 
Return only the number, no explanation.', '8', 'addition', - modelName + modelName, + TEST_LOG_PATH ) + testResults.push(result) + expect(result.result[0]?.trim()?.toLowerCase()).toEqual('8') }, { timeout: 10000 }) it.each(models)('should multiply two numbers with model %s', async (modelName) => { - await runBasicTest( + const result = await runTest( 'multiply 8 and 3. Return only the number, no explanation.', '24', 'multiplication', - modelName + modelName, + TEST_LOG_PATH ) + testResults.push(result) + expect(result.result[0]?.trim()?.toLowerCase()).toEqual('24') }, { timeout: 10000 }) it.each(models)('should divide two numbers with model %s', async (modelName) => { - await runBasicTest( + const result = await runTest( 'divide 15 by 3. Return only the number, no explanation.', '5', 'division', - modelName + modelName, + TEST_LOG_PATH ) + testResults.push(result) + expect(result.result[0]?.trim()?.toLowerCase()).toEqual('5') }, { timeout: 10000 }) it('should generate markdown report', () => { - // Group results by test and model - const latestResults = new Map>() - - // Get only the latest result for each test+model combination - testResults.forEach(result => { - if (!latestResults.has(result.test)) { - latestResults.set(result.test, new Map()) - } - const testMap = latestResults.get(result.test)! 
- const existingResult = testMap.get(result.model) - if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) { - testMap.set(result.model, result) - } - }) - - // Generate markdown report - let report = '# Basic Operations Test Results\n\n' - - // First list failed tests - report += '## Failed Tests\n\n' - let hasFailures = false - for (const [testName, modelResults] of latestResults) { - for (const [model, result] of modelResults) { - if (!result.passed) { - hasFailures = true - report += `### ${testName} - ${model}\n` - report += `- Prompt: \`${result.prompt}\`\n` - report += `- Expected: \`${result.expected}\`\n` - report += `- Actual: \`${result.result[0] || ''}\`\n` - report += `- Duration: ${result.duration}ms\n` - if (result.error) { - report += `- Error Type: ${result.error.type}\n` - report += `- Error Code: ${result.error.code}\n` - report += `- Error Message: ${result.error.message}\n` - if (result.error.details?.message) { - report += `- Error Details: ${result.error.details.message}\n` - } - } - report += `- Reason: ${result.reason}\n` - report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` - } - } - } - - if (!hasFailures) { - report += '*No failed tests*\n\n' - } - - // Then list passed tests - report += '## Passed Tests\n\n' - let hasPassed = false - for (const [testName, modelResults] of latestResults) { - for (const [model, result] of modelResults) { - if (result.passed) { - hasPassed = true - report += `### ${testName} - ${model}\n` - report += `- Prompt: \`${result.prompt}\`\n` - report += `- Expected: \`${result.expected}\`\n` - report += `- Actual: \`${result.result[0] || ''}\`\n` - report += `- Duration: ${result.duration}ms\n` - report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` - } - } - } - - if (!hasPassed) { - report += '*No passed tests*\n\n' - } - - // Add summary section - report += '## Summary\n\n' - const totalTests = testResults.length - const 
passedTests = testResults.filter(r => r.passed).length - const failedTests = totalTests - passedTests - report += `- Total Tests: ${totalTests}\n` - report += `- Passed: ${passedTests}\n` - report += `- Failed: ${failedTests}\n` - report += `- Success Rate: ${((passedTests / totalTests) * 100).toFixed(2)}%\n\n` - - // Write report to file const reportPath = path.resolve(__dirname, './basic-report.md') - write(reportPath, report) - - // Verify report was written + generateTestReport(testResults, 'Basic Operations Test Results', reportPath) expect(exists(reportPath) === 'file').toBe(true) }) }) \ No newline at end of file diff --git a/packages/kbot/tests/unit/commons.ts b/packages/kbot/tests/unit/commons.ts index 7e084fad..408c37b0 100644 --- a/packages/kbot/tests/unit/commons.ts +++ b/packages/kbot/tests/unit/commons.ts @@ -1,5 +1,9 @@ import * as path from 'node:path' import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL, E_OPENROUTER_MODEL } from '../../src/index' +import { run } from '../../src/index' +import { sync as write } from "@polymech/fs/write" +import { sync as read } from "@polymech/fs/read" +import { sync as exists } from "@polymech/fs/exists" export const models = [ //E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE, @@ -31,6 +35,15 @@ export interface TestResult { duration?: number } +export interface TestHighscore { + test: string; + rankings: { + model: string; + duration: number; + duration_secs: number; + }[]; +} + export const formatError = (error: any): TestResult['error'] => { return { message: error?.message || 'Unknown error', @@ -43,3 +56,251 @@ export const formatError = (error: any): TestResult['error'] => { export const isEmptyResponse = (result: string[] | null | undefined): boolean => { return !result || result.length === 0 || result.every(r => !r || r.trim() === '') } + +export const generateHighscores = (latestResults: Map>): TestHighscore[] => { + const highscores: TestHighscore[] = [] + + for (const [testName, 
modelResults] of latestResults) { + // Convert model results to array and sort by duration + const sortedResults = Array.from(modelResults.entries()) + .map(([model, result]) => ({ model, result })) + .sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0)) + .slice(0, 2) // Get top 2 + + if (sortedResults.length > 0) { + highscores.push({ + test: testName, + rankings: sortedResults.map(({ model, result }) => ({ + model, + duration: result.duration || 0, + duration_secs: (result.duration || 0) / 1000 + })) + }) + } + } + + return highscores +} + +export const runTest = async ( + prompt: string, + expected: string, + testName: string, + modelName: string, + logPath: string +): Promise => { + let model = 'unknown' + let router = 'unknown' + let startTime = Date.now() + let error: TestResult['error'] | undefined + let testResult: TestResult | undefined + + try { + const result = await Promise.race([ + run({ + prompt, + mode: 'completion', + model: modelName, + path: TEST_BASE_PATH, + logs: TEST_LOGS_PATH, + preferences: TEST_PREFERENCES_PATH, + onRun: async (options) => { + model = options.model || 'unknown' + router = options.router || 'unknown' + return options + } + }), + new Promise((_, reject) => + setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT) + ) + ]) as string[] + + if (isEmptyResponse(result)) { + testResult = { + test: testName, + prompt, + result: [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed: false, + duration: Date.now() - startTime, + reason: 'Model returned empty response' + } + } else { + const actual = result?.[0]?.trim()?.toLowerCase() || '' + const passed = actual === expected + + testResult = { + test: testName, + prompt, + result: result || [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed, + duration: Date.now() - startTime, + reason: passed ? 
undefined : `Expected ${expected}, but got ${actual}`, + } + } + } catch (e) { + error = formatError(e) + testResult = { + test: testName, + prompt, + result: [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed: false, + duration: Date.now() - startTime, + error, + reason: error?.message || 'Unknown error occurred' + } + throw e + } finally { + if (testResult) { + const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath)) : { results: [], highscores: [] } + const updatedResults = [...(existingData.results || []), testResult] + + // Group results by test and model + const latestResults = new Map>() + updatedResults.forEach(result => { + if (!latestResults.has(result.test)) { + latestResults.set(result.test, new Map()) + } + const testMap = latestResults.get(result.test)! + const existingResult = testMap.get(result.model) + if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) { + testMap.set(result.model, result) + } + }) + + // Generate highscores + const highscores = generateHighscores(latestResults) + + // Write both results and highscores + write(logPath, JSON.stringify({ + results: updatedResults, + highscores, + lastUpdated: new Date().toISOString() + }, null, 2)) + } + } + return testResult +} + +export const generateTestReport = ( + testResults: TestResult[], + reportTitle: string, + reportPath: string +): void => { + // Builds a markdown report (highscores, summary, failed tests, passed tests) and writes it to reportPath; results are first indexed as test name -> model -> result + const latestResults = new Map<string, Map<string, TestResult>>() + + // Get only the latest result for each test+model combination + testResults.forEach(result => { + if (!latestResults.has(result.test)) { + latestResults.set(result.test, new Map()) + } + const testMap = latestResults.get(result.test)!
+ const existingResult = testMap.get(result.model) + if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) { + testMap.set(result.model, result) + } + }) + + // Generate markdown report + let report = `# ${reportTitle}\n\n` + + // Add highscore section + report += '## Highscores\n\n' + for (const [testName, modelResults] of latestResults) { + report += `### ${testName}\n` + + // Convert model results to array and sort by duration + const sortedResults = Array.from(modelResults.entries()) + .map(([model, result]) => ({ model, result })) + .sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0)) + .slice(0, 2) // Get top 2 + + if (sortedResults.length > 0) { + sortedResults.forEach(({ model, result }, index) => { + const duration = result.duration || 0 + report += `${index + 1}. ${model}: ${duration}ms (${(duration / 1000).toFixed(2)}s)\n` + }) + } else { + report += '*No results available*\n' + } + report += '\n' + } + + // Add summary section (counts every entry in testResults, not only the latest per test+model) + report += '## Summary\n\n' + const totalTests = testResults.length + const passedTests = testResults.filter(r => r.passed).length + const failedTests = totalTests - passedTests + const avgDuration = totalTests > 0 ? testResults.reduce((sum, r) => sum + (r.duration || 0), 0) / totalTests : 0 // avoid NaN when there are no results + report += `- Total Tests: ${totalTests}\n` + report += `- Passed: ${passedTests}\n` + report += `- Failed: ${failedTests}\n` + report += `- Success Rate: ${(totalTests > 0 ? (passedTests / totalTests) * 100 : 0).toFixed(2)}%\n` + report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n` + + // First list failed tests + report += '## Failed Tests\n\n' + let hasFailures = false + for (const [testName, modelResults] of latestResults) { + for (const [model, result] of modelResults) { + if (!result.passed) { + hasFailures = true + report += `### ${testName} - ${model}\n` + report += `- Prompt: \`${result.prompt}\`\n` + report += `- Expected: \`${result.expected}\`\n` + report += `- Actual: 
\`${result.result[0] || ''}\`\n` + report += `- Duration: ${result.duration}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n` // fallback must be parenthesized: `a || 0 / 1000` parses as `a || (0 / 1000)` + if (result.error) { + report += `- Error Type: ${result.error.type}\n` + report += `- Error Code: ${result.error.code}\n` + report += `- Error Message: ${result.error.message}\n` + if (result.error.details?.message) { + report += `- Error Details: ${result.error.details.message}\n` + } + } + report += `- Reason: ${result.reason}\n` + report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` + } + } + } + + if (!hasFailures) { + report += '*No failed tests*\n\n' + } + + // Then list passed tests + report += '## Passed Tests\n\n' + let hasPassed = false + for (const [testName, modelResults] of latestResults) { + for (const [model, result] of modelResults) { + if (result.passed) { + hasPassed = true + report += `### ${testName} - ${model}\n` + report += `- Prompt: \`${result.prompt}\`\n` + report += `- Expected: \`${result.expected}\`\n` + report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Duration: ${result.duration}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n` // ms -> s; same parenthesized fallback as in the failed section + report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` + } + } + } + + if (!hasPassed) { + report += '*No passed tests*\n\n' + } + + // Write report to file + write(reportPath, report) +} diff --git a/packages/kbot/tests/unit/format-report.md b/packages/kbot/tests/unit/format-report.md index 6cef7145..68c174ce 100644 --- a/packages/kbot/tests/unit/format-report.md +++ b/packages/kbot/tests/unit/format-report.md @@ -6,21 +6,14 @@ ## Passed Tests -### json-schema-file-format - google/gemini-2.0-flash-lite-001 +### json-schema-file-format - mistralai/mistral-tiny - Prompt: `Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. 
Return only the JSON object, no explanation.` - Expected: `{"name":"John Doe","age":30,"tags":["developer","javascript"]}` -- Actual: ````json -{ - "name": "John Doe", - "age": 30, - "tags": ["developer", "javascript"] -} -``` -` -- Duration: 1122ms -- Timestamp: 4/2/2025, 9:03:33 AM +- Actual: `{"name": "John Doe", "age": 30, "tags": ["developer", "javascript"]}` +- Duration: 982ms +- Timestamp: 4/2/2025, 10:26:33 AM -### json-schema-object-format - google/gemini-2.0-flash-lite-001 +### json-schema-object-format - mistralai/mistral-tiny - Prompt: `Create a user profile with the following details: - Name: Jane Smith - Age: 25 @@ -39,8 +32,7 @@ } Return only the JSON object, no explanation.` - Expected: `{"name":"Jane Smith","age":25,"email":"jane.smith@company.com","tags":["developer","designer"],"address":{"street":"123 Main St","city":"New York","country":"US","postal_code":"10001"},"preferences":{"theme":"light","notifications":"enabled","language":"English"}}` -- Actual: ````json -{ +- Actual: `{ "name": "Jane Smith", "age": 25, "email": "jane.smith@company.com", @@ -51,16 +43,14 @@ "country": "US", "postal_code": "10001" }, - "preferences": { + "Preferences": { "theme": "light", "notifications": "enabled", "language": "English" } -} -``` -` -- Duration: 1289ms -- Timestamp: 4/2/2025, 9:03:34 AM +}` +- Duration: 1673ms +- Timestamp: 4/2/2025, 10:26:35 AM ## Summary diff --git a/packages/kbot/tests/unit/format.json b/packages/kbot/tests/unit/format.json index 4e154acb..293695b1 100644 --- a/packages/kbot/tests/unit/format.json +++ b/packages/kbot/tests/unit/format.json @@ -3,26 +3,26 @@ "test": "json-schema-file-format", "prompt": "Create a user profile with name John Doe, age 30, and tags [\"developer\", \"javascript\"]. 
Return only the JSON object, no explanation.", "result": [ - "```json\n{\n \"name\": \"John Doe\",\n \"age\": 30,\n \"tags\": [\"developer\", \"javascript\"]\n}\n```\n" + "{\"name\": \"John Doe\", \"age\": 30, \"tags\": [\"developer\", \"javascript\"]}" ], "expected": "{\"name\":\"John Doe\",\"age\":30,\"tags\":[\"developer\",\"javascript\"]}", - "model": "google/gemini-2.0-flash-lite-001", + "model": "mistralai/mistral-tiny", "router": "openrouter", - "timestamp": "2025-04-02T07:03:33.038Z", + "timestamp": "2025-04-02T08:26:33.513Z", "passed": true, - "duration": 1122 + "duration": 982 }, { "test": "json-schema-object-format", "prompt": "Create a user profile with the following details:\n - Name: Jane Smith\n - Age: 25\n - Email: jane.smith@company.com\n - Tags: [\"developer\", \"designer\"]\n - Address: {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n }\n - Preferences: {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n Return only the JSON object, no explanation.", "result": [ - "```json\n{\n \"name\": \"Jane Smith\",\n \"age\": 25,\n \"email\": \"jane.smith@company.com\",\n \"tags\": [\"developer\", \"designer\"],\n \"address\": {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n },\n \"preferences\": {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n}\n```\n" + "{\n \"name\": \"Jane Smith\",\n \"age\": 25,\n \"email\": \"jane.smith@company.com\",\n \"tags\": [\"developer\", \"designer\"],\n \"address\": {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n },\n \"Preferences\": {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n}" ], "expected": "{\"name\":\"Jane 
Smith\",\"age\":25,\"email\":\"jane.smith@company.com\",\"tags\":[\"developer\",\"designer\"],\"address\":{\"street\":\"123 Main St\",\"city\":\"New York\",\"country\":\"US\",\"postal_code\":\"10001\"},\"preferences\":{\"theme\":\"light\",\"notifications\":\"enabled\",\"language\":\"English\"}}", - "model": "google/gemini-2.0-flash-lite-001", + "model": "mistralai/mistral-tiny", "router": "openrouter", - "timestamp": "2025-04-02T07:03:34.328Z", + "timestamp": "2025-04-02T08:26:35.187Z", "passed": true, - "duration": 1289 + "duration": 1673 } ] \ No newline at end of file diff --git a/packages/kbot/tests/unit/format.test.ts b/packages/kbot/tests/unit/format.test.ts index c7185017..a7c348b5 100644 --- a/packages/kbot/tests/unit/format.test.ts +++ b/packages/kbot/tests/unit/format.test.ts @@ -17,7 +17,8 @@ import { const TEST_LOG_PATH = path.resolve(__dirname, './format.json') const TEST_SCHEMA_PATH = path.resolve(__dirname, './test-schema.json') -const TEST_MODEL = 'google/gemini-2.0-flash-lite-001' +const TEST_MODEL_FAST = 'mistralai/codestral-2501' +const TEST_MODEL = 'mistralai/mistral-tiny' const TEST_ROUTER = 'openrouter' // Sample JSON Schema for testing diff --git a/packages/kbot/tests/unit/language-report.md b/packages/kbot/tests/unit/language-report.md index e0891e6e..60c5a947 100644 --- a/packages/kbot/tests/unit/language-report.md +++ b/packages/kbot/tests/unit/language-report.md @@ -40,6 +40,22 @@ - Reason: Unknown error occurred - Timestamp: 4/1/2025, 11:53:55 PM +### german - anthropic/claude-3.5-sonnet +- Prompt: `translate "hello" to German. Return only the translated word, no explanation.` +- Expected: `hallo` +- Actual: `` +- Duration: 1753ms +- Reason: Unknown error occurred +- Timestamp: 4/2/2025, 10:29:45 AM + +### german - qwen/qwq-32b +- Prompt: `translate "hello" to German. 
Return only the translated word, no explanation.` +- Expected: `hallo` +- Actual: `` +- Duration: 13757ms +- Reason: Unknown error occurred +- Timestamp: 4/2/2025, 10:29:59 AM + ### spanish - deepseek/deepseek-chat:free - Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` - Expected: `sí` @@ -78,6 +94,22 @@ - Reason: Unknown error occurred - Timestamp: 4/1/2025, 11:53:56 PM +### spanish - anthropic/claude-3.5-sonnet +- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` +- Expected: `sí` +- Actual: `` +- Duration: 1242ms +- Reason: Unknown error occurred +- Timestamp: 4/2/2025, 10:29:51 AM + +### spanish - qwen/qwq-32b +- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` +- Expected: `sí` +- Actual: `` +- Duration: 5925ms +- Reason: Unknown error occurred +- Timestamp: 4/2/2025, 10:29:57 AM + ### french - deepseek/deepseek-chat:free - Prompt: `translate "no" to French. Return only the translated word, no explanation.` - Expected: `non` @@ -116,6 +148,22 @@ - Reason: Unknown error occurred - Timestamp: 4/1/2025, 11:53:57 PM +### french - anthropic/claude-3.5-sonnet +- Prompt: `translate "no" to French. Return only the translated word, no explanation.` +- Expected: `non` +- Actual: `` +- Duration: 2656ms +- Reason: Unknown error occurred +- Timestamp: 4/2/2025, 10:29:59 AM + +### french - qwen/qwq-32b +- Prompt: `translate "no" to French. 
Return only the translated word, no explanation.` +- Expected: `non` +- Actual: `` +- Duration: 4063ms +- Reason: Unknown error occurred +- Timestamp: 4/2/2025, 10:30:03 AM + ## Passed Tests ### german_translation - deepseek/deepseek-chat:free diff --git a/packages/kbot/tests/unit/language.json b/packages/kbot/tests/unit/language.json index df969918..17dde7ba 100644 --- a/packages/kbot/tests/unit/language.json +++ b/packages/kbot/tests/unit/language.json @@ -843,5 +843,77 @@ "passed": false, "duration": 968, "reason": "Unknown error occurred" + }, + { + "test": "german", + "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.", + "result": [], + "expected": "hallo", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T08:29:45.316Z", + "passed": false, + "duration": 1753, + "reason": "Unknown error occurred" + }, + { + "test": "spanish", + "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.", + "result": [], + "expected": "sí", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T08:29:51.577Z", + "passed": false, + "duration": 1242, + "reason": "Unknown error occurred" + }, + { + "test": "spanish", + "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.", + "result": [], + "expected": "sí", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T08:29:57.503Z", + "passed": false, + "duration": 5925, + "reason": "Unknown error occurred" + }, + { + "test": "german", + "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.", + "result": [], + "expected": "hallo", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T08:29:59.075Z", + "passed": false, + "duration": 13757, + "reason": "Unknown error occurred" + }, + { + "test": "french", + "prompt": "translate \"no\" to French. 
Return only the translated word, no explanation.", + "result": [], + "expected": "non", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T08:29:59.249Z", + "passed": false, + "duration": 2656, + "reason": "Unknown error occurred" + }, + { + "test": "french", + "prompt": "translate \"no\" to French. Return only the translated word, no explanation.", + "result": [], + "expected": "non", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T08:30:03.313Z", + "passed": false, + "duration": 4063, + "reason": "Unknown error occurred" } ] \ No newline at end of file