From 91517c7ac1ab41217c2f0806af4b7a8168e494c7 Mon Sep 17 00:00:00 2001 From: babayaga Date: Tue, 1 Apr 2025 13:02:25 +0200 Subject: [PATCH] tests:language - merci, habsch selbstgemacht :) --- packages/kbot/logs/params.json | 2 +- packages/kbot/package.json | 3 +- packages/kbot/tests/unit/basic-report.md | 109 ++++ packages/kbot/tests/unit/basic.json | 552 ++++++++++++++++++++ packages/kbot/tests/unit/basic.test.ts | 106 ++-- packages/kbot/tests/unit/commons.ts | 24 + packages/kbot/tests/unit/language-report.md | 113 ++++ packages/kbot/tests/unit/language.json | 530 +++++++++++++++++++ packages/kbot/tests/unit/language.test.ts | 200 +++++++ packages/kbot/tests/unit/math-report.md | 22 +- packages/kbot/tests/unit/math.json | 432 +++++++++++++++ packages/kbot/tests/unit/math.test.ts | 117 +++-- 12 files changed, 2129 insertions(+), 81 deletions(-) create mode 100644 packages/kbot/tests/unit/basic-report.md create mode 100644 packages/kbot/tests/unit/commons.ts create mode 100644 packages/kbot/tests/unit/language-report.md create mode 100644 packages/kbot/tests/unit/language.json create mode 100644 packages/kbot/tests/unit/language.test.ts diff --git a/packages/kbot/logs/params.json b/packages/kbot/logs/params.json index 331cfc4a..d897f083 100644 --- a/packages/kbot/logs/params.json +++ b/packages/kbot/logs/params.json @@ -3,7 +3,7 @@ "messages": [ { "role": "user", - "content": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation." + "content": "divide 15 by 3. Return only the number, no explanation." }, { "role": "user", diff --git a/packages/kbot/package.json b/packages/kbot/package.json index 302467f4..eea2c0f6 100644 --- a/packages/kbot/package.json +++ b/packages/kbot/package.json @@ -17,8 +17,9 @@ "dev": "tsc -p . 
--watch", "lint": "eslint src --ext .ts", "test": "vitest run", - "test:unit": "vitest run tests/unit", + "test:basic": "vitest run tests/unit/basic.test.ts", "test:math": "vitest run tests/unit/math.test.ts", + "test:language": "vitest run tests/unit/language.test.ts", "test2:watch": "vitest", "test2:coverage": "vitest run --coverage", "webpack": "webpack --config webpack.config.js --stats-error-details", diff --git a/packages/kbot/tests/unit/basic-report.md b/packages/kbot/tests/unit/basic-report.md new file mode 100644 index 00000000..5d0b3391 --- /dev/null +++ b/packages/kbot/tests/unit/basic-report.md @@ -0,0 +1,109 @@ +# Basic Operations Test Results + +## Failed Tests + +### basic_arithmetic - deepseek/deepseek-chat:free +- Prompt: `return the result of 2+2, dont comment` +- Expected: `undefined` +- Actual: `4` +- Reason: undefined +- Timestamp: 4/1/2025, 12:26:30 PM + +### basic_arithmetic - google/gemini-2.0-flash-exp:free +- Prompt: `return the result of 2+2, dont comment` +- Expected: `undefined` +- Actual: `4 +` +- Reason: undefined +- Timestamp: 4/1/2025, 12:26:31 PM + +### basic_arithmetic - gpt-4 +- Prompt: `return the result of 2+2, dont comment` +- Expected: `undefined` +- Actual: `4` +- Reason: undefined +- Timestamp: 4/1/2025, 12:26:32 PM + +### json_structure - deepseek/deepseek-chat:free +- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.` +- Expected: `undefined` +- Actual: `{"name":"test","value":42}` +- Reason: undefined +- Timestamp: 4/1/2025, 12:26:33 PM + +### json_structure - gpt-4 +- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.` +- Expected: `undefined` +- Actual: `{"name": "test", "value": 42}` +- Reason: undefined +- Timestamp: 4/1/2025, 12:26:36 PM + +### json_structure - google/gemini-2.0-flash-exp:free +- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. 
Return only the JSON, no other text.` +- Expected: `undefined` +- Actual: `{ + "name": "test", + "value": 42 +}` +- Reason: undefined +- Timestamp: 4/1/2025, 12:26:35 PM + +## Passed Tests + +### addition - deepseek/deepseek-chat:free +- Prompt: `add 5 and 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Timestamp: 4/1/2025, 12:59:06 PM + +### addition - google/gemini-2.0-flash-exp:free +- Prompt: `add 5 and 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8 +` +- Timestamp: 4/1/2025, 12:59:08 PM + +### addition - gpt-4 +- Prompt: `add 5 and 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Timestamp: 4/1/2025, 1:00:35 PM + +### multiplication - deepseek/deepseek-chat:free +- Prompt: `multiply 8 and 3. Return only the number, no explanation.` +- Expected: `24` +- Actual: `24` +- Timestamp: 4/1/2025, 12:59:13 PM + +### multiplication - google/gemini-2.0-flash-exp:free +- Prompt: `multiply 8 and 3. Return only the number, no explanation.` +- Expected: `24` +- Actual: `24 +` +- Timestamp: 4/1/2025, 12:59:15 PM + +### multiplication - gpt-4 +- Prompt: `multiply 8 and 3. Return only the number, no explanation.` +- Expected: `24` +- Actual: `24` +- Timestamp: 4/1/2025, 1:00:37 PM + +### division - deepseek/deepseek-chat:free +- Prompt: `divide 15 by 3. Return only the number, no explanation.` +- Expected: `5` +- Actual: `5` +- Timestamp: 4/1/2025, 12:59:18 PM + +### division - google/gemini-2.0-flash-exp:free +- Prompt: `divide 15 by 3. Return only the number, no explanation.` +- Expected: `5` +- Actual: `5 +` +- Timestamp: 4/1/2025, 12:56:09 PM + +### division - gpt-4 +- Prompt: `divide 15 by 3. 
Return only the number, no explanation.` +- Expected: `5` +- Actual: `5` +- Timestamp: 4/1/2025, 1:00:40 PM + diff --git a/packages/kbot/tests/unit/basic.json b/packages/kbot/tests/unit/basic.json index 8f54473b..76593563 100644 --- a/packages/kbot/tests/unit/basic.json +++ b/packages/kbot/tests/unit/basic.json @@ -228,5 +228,557 @@ "model": "gpt-4", "router": "openrouter", "timestamp": "2025-04-01T10:26:36.239Z" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:03.882Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8\n" + ], + "expected": "8", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:05.343Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:06.365Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 4 and 6. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:07.564Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 4 and 6. Return only the number, no explanation.", + "result": [ + "24\n" + ], + "expected": "24", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:09.013Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 4 and 6. 
Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:09.968Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:10.981Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5\n" + ], + "expected": "5", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:12.829Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:13.957Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:47.732Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8\n" + ], + "expected": "8", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:50.075Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:50.977Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 4 and 6. 
Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:52.835Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 4 and 6. Return only the number, no explanation.", + "result": [ + "24\n" + ], + "expected": "24", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:54.875Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 4 and 6. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:55.866Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:56.950Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:59.317Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:01.223Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8\n" + ], + "expected": "8", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:02.717Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. 
Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:03.867Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 4 and 6. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:04.864Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 4 and 6. Return only the number, no explanation.", + "result": [ + "24\n" + ], + "expected": "24", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:05.973Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 4 and 6. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:07.075Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:08.103Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5\n" + ], + "expected": "5", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:09.298Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:10.178Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. 
Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:32.641Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 4 and 6. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:35.038Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:37.567Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:06.742Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8\n" + ], + "expected": "8", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:08.889Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:09.871Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:13.627Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. 
Return only the number, no explanation.", + "result": [ + "24\n" + ], + "expected": "24", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:15.963Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:17.124Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:18.527Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:20.298Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:58.244Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:00:00.620Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:00:03.143Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. 
Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:00:10.210Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:00:12.798Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:00:15.604Z", + "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:00:35.256Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:00:37.681Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. 
Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:00:40.556Z", + "passed": true } ] \ No newline at end of file diff --git a/packages/kbot/tests/unit/basic.test.ts b/packages/kbot/tests/unit/basic.test.ts index 043822a4..000669e2 100644 --- a/packages/kbot/tests/unit/basic.test.ts +++ b/packages/kbot/tests/unit/basic.test.ts @@ -4,32 +4,10 @@ import * as path from 'node:path' import { sync as write } from "@polymech/fs/write" import { sync as read } from "@polymech/fs/read" import { sync as exists } from "@polymech/fs/exists" +import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons' -import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index' - -const models = [ - E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE, - E_OPENROUTER_MODEL_FREE.MODEL_FREE_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE, - E_OPENAI_MODEL.MODEL_GPT_4 -] - -const TEST_BASE_PATH = path.resolve(__dirname, '../../') -const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs') -const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md') const TEST_LOG_PATH = path.resolve(__dirname, './basic.json') -interface TestResult { - test: string; - prompt: string; - result: string[]; - expected: string; - model: string; - router: string; - timestamp: string; - passed: boolean; - reason?: string; -} - describe('Basic Operations', () => { let testResults: TestResult[] = [] @@ -58,8 +36,8 @@ describe('Basic Operations', () => { return options } }) as string[] - - const actual = result.map(r => r.trim())[0] + + const actual = result?.[0]?.trim() || '' const passed = actual === expected expect(actual).toEqual(expected) @@ -67,13 +45,13 @@ describe('Basic Operations', () => { testResults.push({ test: 'addition', prompt, - result, + result: result || [], expected, model, router, timestamp: new Date().toISOString(), passed, - 
reason: passed ? undefined : `Expected ${expected}, but got ${actual}` + reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` }) // Write all results to the same file @@ -81,7 +59,7 @@ describe('Basic Operations', () => { }) it.each(models)('should multiply two numbers with model %s', async (modelName) => { - const prompt = 'multiply 4 and 6. Return only the number, no explanation.' + const prompt = 'multiply 8 and 3. Return only the number, no explanation.' const expected = '24' let model = 'unknown' let router = 'unknown' @@ -98,9 +76,9 @@ describe('Basic Operations', () => { router = options.router || 'unknown' return options } - }) as string[] + }) as string[] - const actual = result.map(r => r.trim())[0] + const actual = result?.[0]?.trim() || '' const passed = actual === expected expect(actual).toEqual(expected) @@ -108,13 +86,13 @@ describe('Basic Operations', () => { testResults.push({ test: 'multiplication', prompt, - result, + result: result || [], expected, model, router, timestamp: new Date().toISOString(), passed, - reason: passed ? undefined : `Expected ${expected}, but got ${actual}` + reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` }) // Write all results to the same file @@ -139,9 +117,9 @@ describe('Basic Operations', () => { router = options.router || 'unknown' return options } - }) as string[] + }) as string[] - const actual = result.map(r => r.trim())[0] + const actual = result?.[0]?.trim() || '' const passed = actual === expected expect(actual).toEqual(expected) @@ -149,16 +127,72 @@ describe('Basic Operations', () => { testResults.push({ test: 'division', prompt, - result, + result: result || [], expected, model, router, timestamp: new Date().toISOString(), passed, - reason: passed ? undefined : `Expected ${expected}, but got ${actual}` + reason: !actual ? 'No result returned from model' : passed ? 
undefined : `Expected ${expected}, but got ${actual}` }) // Write all results to the same file write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) }) + + it('should generate markdown report', () => { + // Group results by test and model + const latestResults = new Map<string, Map<string, TestResult>>() + + // Get only the latest result for each test+model combination + testResults.forEach(result => { + if (!latestResults.has(result.test)) { + latestResults.set(result.test, new Map()) + } + const testMap = latestResults.get(result.test)! + const existingResult = testMap.get(result.model) + if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) { + testMap.set(result.model, result) + } + }) + + // Generate markdown report + let report = '# Basic Operations Test Results\n\n' + + // First list failed tests + report += '## Failed Tests\n\n' + for (const [testName, modelResults] of latestResults) { + for (const [model, result] of modelResults) { + if (!result.passed) { + report += `### ${testName} - ${model}\n` + report += `- Prompt: \`${result.prompt}\`\n` + report += `- Expected: \`${result.expected}\`\n` + report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Reason: ${result.reason}\n` + report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` + } + } + } + + // Then list passed tests + report += '## Passed Tests\n\n' + for (const [testName, modelResults] of latestResults) { + for (const [model, result] of modelResults) { + if (result.passed) { + report += `### ${testName} - ${model}\n` + report += `- Prompt: \`${result.prompt}\`\n` + report += `- Expected: \`${result.expected}\`\n` + report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` + } + } + } + + // Write report to file + const reportPath = path.resolve(__dirname, './basic-report.md') + write(reportPath, report) + + // Verify report was written + expect(exists(reportPath) ===
'file').toBe(true) + }) }) \ No newline at end of file diff --git a/packages/kbot/tests/unit/commons.ts b/packages/kbot/tests/unit/commons.ts new file mode 100644 index 00000000..9a3d5f50 --- /dev/null +++ b/packages/kbot/tests/unit/commons.ts @@ -0,0 +1,24 @@ +import * as path from 'node:path' +import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index' + +export const models = [ + E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE, + E_OPENROUTER_MODEL_FREE.MODEL_FREE_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE, + E_OPENAI_MODEL.MODEL_GPT_4 +] + +export const TEST_BASE_PATH = path.resolve(__dirname, '../../') +export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs') +export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md') + +export interface TestResult { + test: string; + prompt: string; + result: string[]; + expected: string; + model: string; + router: string; + timestamp: string; + passed: boolean; + reason?: string; +} \ No newline at end of file diff --git a/packages/kbot/tests/unit/language-report.md b/packages/kbot/tests/unit/language-report.md new file mode 100644 index 00000000..8d778ee4 --- /dev/null +++ b/packages/kbot/tests/unit/language-report.md @@ -0,0 +1,113 @@ +# Language Test Results + +## Failed Tests + +## Passed Tests + +### german_translation - deepseek/deepseek-chat:free +- Prompt: `translate "hello" to German. Return only the translation, no explanation.` +- Expected: `hallo` +- Actual: `Hallo` +- Timestamp: 4/1/2025, 12:56:01 PM + +### german_translation - google/gemini-2.0-flash-exp:free +- Prompt: `translate "hello" to German. Return only the translation, no explanation.` +- Expected: `hallo` +- Actual: `Hallo +` +- Timestamp: 4/1/2025, 12:56:02 PM + +### german_translation - gpt-4 +- Prompt: `translate "hello" to German. 
Return only the translation, no explanation.` +- Expected: `hallo` +- Actual: `Hallo` +- Timestamp: 4/1/2025, 12:56:32 PM + +### spanish_translation - deepseek/deepseek-chat:free +- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.` +- Expected: `sí` +- Actual: `sí` +- Timestamp: 4/1/2025, 12:56:05 PM + +### spanish_translation - google/gemini-2.0-flash-exp:free +- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.` +- Expected: `sí` +- Actual: `sí +` +- Timestamp: 4/1/2025, 12:56:06 PM + +### spanish_translation - gpt-4 +- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.` +- Expected: `sí` +- Actual: `sí` +- Timestamp: 4/1/2025, 12:56:35 PM + +### french_translation - deepseek/deepseek-chat:free +- Prompt: `translate "no" to French. Return only the translation, no explanation.` +- Expected: `non` +- Actual: `non` +- Timestamp: 4/1/2025, 12:56:08 PM + +### french_translation - google/gemini-2.0-flash-exp:free +- Prompt: `translate "no" to French. Return only the translation, no explanation.` +- Expected: `non` +- Actual: `non +` +- Timestamp: 4/1/2025, 12:56:10 PM + +### french_translation - gpt-4 +- Prompt: `translate "no" to French. Return only the translation, no explanation.` +- Expected: `non` +- Actual: `non` +- Timestamp: 4/1/2025, 12:56:37 PM + +### german - deepseek/deepseek-chat:free +- Prompt: `translate "hello" to German. Return only the translated word, no explanation.` +- Expected: `hallo` +- Actual: `Hallo` +- Timestamp: 4/1/2025, 12:59:07 PM + +### german - google/gemini-2.0-flash-exp:free +- Prompt: `translate "hello" to German. Return only the translated word, no explanation.` +- Expected: `hallo` +- Actual: `Hallo +` +- Timestamp: 4/1/2025, 12:59:09 PM + +### german - gpt-4 +- Prompt: `translate "hello" to German. 
Return only the translated word, no explanation.` +- Expected: `hallo` +- Actual: `Hallo` +- Timestamp: 4/1/2025, 12:59:11 PM + +### spanish - deepseek/deepseek-chat:free +- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` +- Expected: `sí` +- Actual: `Sí` +- Timestamp: 4/1/2025, 12:59:12 PM + +### spanish - google/gemini-2.0-flash-exp:free +- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` +- Expected: `sí` +- Actual: `Sí +` +- Timestamp: 4/1/2025, 12:59:14 PM + +### spanish - gpt-4 +- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` +- Expected: `sí` +- Actual: `sí` +- Timestamp: 4/1/2025, 12:59:15 PM + +### french - deepseek/deepseek-chat:free +- Prompt: `translate "no" to French. Return only the translated word, no explanation.` +- Expected: `non` +- Actual: `non` +- Timestamp: 4/1/2025, 12:59:18 PM + +### french - gpt-4 +- Prompt: `translate "no" to French. Return only the translated word, no explanation.` +- Expected: `non` +- Actual: `non` +- Timestamp: 4/1/2025, 12:59:20 PM + diff --git a/packages/kbot/tests/unit/language.json b/packages/kbot/tests/unit/language.json new file mode 100644 index 00000000..68177172 --- /dev/null +++ b/packages/kbot/tests/unit/language.json @@ -0,0 +1,530 @@ +[ + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. Return only the translation, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:50:01.527Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. 
Return only the translation, no explanation.", + "result": [ + "Hallo\n" + ], + "expected": "hallo", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:50:03.169Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. Return only the translation, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:50:04.655Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "sí" + ], + "expected": "sí", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:50:05.726Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "sí\n" + ], + "expected": "sí", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:50:08.264Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "sí" + ], + "expected": "sí", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:50:09.514Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. Return only the translation, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:50:11.281Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. 
Return only the translation, no explanation.", + "result": [ + "non\n" + ], + "expected": "non", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:50:12.942Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. Return only the translation, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:50:14.089Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. Return only the translation, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:51:53.360Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. Return only the translation, no explanation.", + "result": [ + "Hallo\n" + ], + "expected": "hallo", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:51:54.551Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. Return only the translation, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:51:55.699Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "sí" + ], + "expected": "sí", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:51:56.740Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. 
Return only the translation, no explanation.", + "result": [ + "Sí\n" + ], + "expected": "sí", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:51:57.918Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "Sí" + ], + "expected": "sí", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:51:58.882Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. Return only the translation, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:52:00.119Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. Return only the translation, no explanation.", + "result": [ + "Non\n" + ], + "expected": "non", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:52:01.184Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. Return only the translation, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:52:02.204Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. Return only the translation, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:50.924Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. 
Return only the translation, no explanation.", + "result": [ + "Hallo\n" + ], + "expected": "hallo", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:52.477Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. Return only the translation, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:53.546Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "sí" + ], + "expected": "sí", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:56.815Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "sí" + ], + "expected": "sí", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:58.718Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. Return only the translation, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:54:01.673Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. Return only the translation, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:01.199Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. 
Return only the translation, no explanation.", + "result": [ + "Hallo\n" + ], + "expected": "hallo", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:02.857Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. Return only the translation, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:03.788Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "sí" + ], + "expected": "sí", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:05.218Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "sí\n" + ], + "expected": "sí", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:06.317Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "sí" + ], + "expected": "sí", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:07.436Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. Return only the translation, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:08.879Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. 
Return only the translation, no explanation.", + "result": [ + "non\n" + ], + "expected": "non", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:10.365Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. Return only the translation, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:11.295Z", + "passed": true + }, + { + "test": "german_translation", + "prompt": "translate \"hello\" to German. Return only the translation, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:32.735Z", + "passed": true + }, + { + "test": "spanish_translation", + "prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.", + "result": [ + "sí" + ], + "expected": "sí", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:35.276Z", + "passed": true + }, + { + "test": "french_translation", + "prompt": "translate \"no\" to French. Return only the translation, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:37.673Z", + "passed": true + }, + { + "test": "german", + "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:07.944Z", + "passed": true + }, + { + "test": "german", + "prompt": "translate \"hello\" to German. 
Return only the translated word, no explanation.", + "result": [ + "Hallo\n" + ], + "expected": "hallo", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:09.471Z", + "passed": true + }, + { + "test": "german", + "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:11.335Z", + "passed": true + }, + { + "test": "spanish", + "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.", + "result": [ + "Sí" + ], + "expected": "sí", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:12.740Z", + "passed": true + }, + { + "test": "spanish", + "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.", + "result": [ + "Sí\n" + ], + "expected": "sí", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:14.246Z", + "passed": true + }, + { + "test": "spanish", + "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.", + "result": [ + "sí" + ], + "expected": "sí", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:15.205Z", + "passed": true + }, + { + "test": "french", + "prompt": "translate \"no\" to French. Return only the translated word, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:18.139Z", + "passed": true + }, + { + "test": "french", + "prompt": "translate \"no\" to French. 
Return only the translated word, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:20.589Z", + "passed": true + } +] \ No newline at end of file diff --git a/packages/kbot/tests/unit/language.test.ts b/packages/kbot/tests/unit/language.test.ts new file mode 100644 index 00000000..4eed7007 --- /dev/null +++ b/packages/kbot/tests/unit/language.test.ts @@ -0,0 +1,200 @@ +import { describe, it, expect } from 'vitest' +import { run } from '../../src/index' +import * as path from 'node:path' +import { sync as write } from "@polymech/fs/write" +import { sync as read } from "@polymech/fs/read" +import { sync as exists } from "@polymech/fs/exists" +import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons' + +import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index' + +const TEST_LOG_PATH = path.resolve(__dirname, './language.json') + +describe('Language Capabilities', () => { + let testResults: TestResult[] = [] + + // Load existing results if any + if (exists(TEST_LOG_PATH)) { + const data = read(TEST_LOG_PATH, 'json') + testResults = Array.isArray(data) ? data : [] + } + + it.each(models)('should translate "hello" to German with model %s', async (modelName) => { + const prompt = 'translate "hello" to German. Return only the translated word, no explanation.' 
+ const expected = 'hallo' + let model = 'unknown' + let router = 'unknown' + + const result = await run({ + prompt, + mode: 'completion', + model: modelName, + path: TEST_BASE_PATH, + logs: TEST_LOGS_PATH, + preferences: TEST_PREFERENCES_PATH, + onRun: async (options) => { + model = options.model || 'unknown' + router = options.router || 'unknown' + return options + } + }) as string[] + + const actual = result?.[0]?.trim()?.toLowerCase() || '' + const passed = actual === expected + expect(actual).toEqual(expected) + + // Add test result to array + testResults.push({ + test: 'german', + prompt, + result: result || [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed, + reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` + }) + + // Write all results to the same file + write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + }) + + it.each(models)('should translate "yes" to Spanish with model %s', async (modelName) => { + const prompt = 'translate "yes" to Spanish. Return only the translated word, no explanation.' + const expected = 'sí' + let model = 'unknown' + let router = 'unknown' + + const result = await run({ + prompt, + mode: 'completion', + model: modelName, + path: TEST_BASE_PATH, + logs: TEST_LOGS_PATH, + preferences: TEST_PREFERENCES_PATH, + onRun: async (options) => { + model = options.model || 'unknown' + router = options.router || 'unknown' + return options + } + }) as string[] + + const actual = result?.[0]?.trim()?.toLowerCase() || '' + const passed = actual === expected + expect(actual).toEqual(expected) + + // Add test result to array + testResults.push({ + test: 'spanish', + prompt, + result: result || [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed, + reason: !actual ? 'No result returned from model' : passed ? 
undefined : `Expected ${expected}, but got ${actual}` + }) + + // Write all results to the same file + write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + }) + + it.each(models)('should translate "no" to French with model %s', async (modelName) => { + const prompt = 'translate "no" to French. Return only the translated word, no explanation.' + const expected = 'non' + let model = 'unknown' + let router = 'unknown' + + const result = await run({ + prompt, + mode: 'completion', + model: modelName, + path: TEST_BASE_PATH, + logs: TEST_LOGS_PATH, + preferences: TEST_PREFERENCES_PATH, + onRun: async (options) => { + model = options.model || 'unknown' + router = options.router || 'unknown' + return options + } + }) as string[] + + const actual = result?.[0]?.trim()?.toLowerCase() || '' + const passed = actual === expected + expect(actual).toEqual(expected) + + // Add test result to array + testResults.push({ + test: 'french', + prompt, + result: result || [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed, + reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` + }) + + // Write all results to the same file + write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + }) + + it('should generate markdown report', () => { + // Group results by test and model + const latestResults = new Map>() + + // Get only the latest result for each test+model combination + testResults.forEach(result => { + if (!latestResults.has(result.test)) { + latestResults.set(result.test, new Map()) + } + const testMap = latestResults.get(result.test)! 
+ const existingResult = testMap.get(result.model) + if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) { + testMap.set(result.model, result) + } + }) + + // Generate markdown report + let report = '# Language Test Results\n\n' + + // First list failed tests + report += '## Failed Tests\n\n' + for (const [testName, modelResults] of latestResults) { + for (const [model, result] of modelResults) { + if (!result.passed) { + report += `### ${testName} - ${model}\n` + report += `- Prompt: \`${result.prompt}\`\n` + report += `- Expected: \`${result.expected}\`\n` + report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Reason: ${result.reason}\n` + report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` + } + } + } + + // Then list passed tests + report += '## Passed Tests\n\n' + for (const [testName, modelResults] of latestResults) { + for (const [model, result] of modelResults) { + if (result.passed) { + report += `### ${testName} - ${model}\n` + report += `- Prompt: \`${result.prompt}\`\n` + report += `- Expected: \`${result.expected}\`\n` + report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` + } + } + } + + // Write report to file + const reportPath = path.resolve(__dirname, './language-report.md') + write(reportPath, report) + + // Verify report was written + expect(exists(reportPath) === 'file').toBe(true) + }) +}) \ No newline at end of file diff --git a/packages/kbot/tests/unit/math-report.md b/packages/kbot/tests/unit/math-report.md index 8e14405d..8590997b 100644 --- a/packages/kbot/tests/unit/math-report.md +++ b/packages/kbot/tests/unit/math-report.md @@ -8,55 +8,55 @@ - Prompt: `calculate the factorial of 5 (5!). 
Return only the number, no explanation.` - Expected: `120` - Actual: `120` -- Timestamp: 4/1/2025, 12:42:00 PM +- Timestamp: 4/1/2025, 12:59:09 PM ### factorial - google/gemini-2.0-flash-exp:free - Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.` - Expected: `120` - Actual: `120 ` -- Timestamp: 4/1/2025, 12:42:02 PM +- Timestamp: 4/1/2025, 12:59:10 PM ### factorial - gpt-4 - Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.` - Expected: `120` - Actual: `120` -- Timestamp: 4/1/2025, 12:42:03 PM +- Timestamp: 4/1/2025, 12:59:11 PM ### fibonacci - deepseek/deepseek-chat:free - Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.` - Expected: `0,1,1,2,3` - Actual: `0,1,1,2,3` -- Timestamp: 4/1/2025, 12:41:49 PM +- Timestamp: 4/1/2025, 12:59:13 PM ### fibonacci - google/gemini-2.0-flash-exp:free - Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.` - Expected: `0,1,1,2,3` - Actual: `0,1,1,2,3 ` -- Timestamp: 4/1/2025, 12:42:10 PM +- Timestamp: 4/1/2025, 12:59:14 PM ### fibonacci - gpt-4 - Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.` - Expected: `0,1,1,2,3` - Actual: `0, 1, 1, 2, 3` -- Timestamp: 4/1/2025, 12:42:12 PM +- Timestamp: 4/1/2025, 12:59:16 PM ### quadratic - deepseek/deepseek-chat:free - Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.` - Expected: `[-3,-2]` - Actual: `[-2, -3]` -- Timestamp: 4/1/2025, 12:42:16 PM +- Timestamp: 4/1/2025, 12:59:19 PM ### quadratic - google/gemini-2.0-flash-exp:free - Prompt: `solve the quadratic equation x² + 5x + 6 = 0. 
Return only the roots as a JSON array, no explanation.` - Expected: `[-3,-2]` -- Actual: `[-2.0, -3.0]` -- Timestamp: 4/1/2025, 12:42:17 PM +- Actual: `[-2, -3]` +- Timestamp: 4/1/2025, 12:46:13 PM ### quadratic - gpt-4 - Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.` - Expected: `[-3,-2]` -- Actual: `[-2, -3]` -- Timestamp: 4/1/2025, 12:42:18 PM +- Actual: `["-2", "-3"]` +- Timestamp: 4/1/2025, 12:59:20 PM diff --git a/packages/kbot/tests/unit/math.json b/packages/kbot/tests/unit/math.json index 7a8a2f4a..9803f2e7 100644 --- a/packages/kbot/tests/unit/math.json +++ b/packages/kbot/tests/unit/math.json @@ -552,5 +552,437 @@ "router": "openrouter", "timestamp": "2025-04-01T10:42:18.707Z", "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:04.344Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120\n" + ], + "expected": "120", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:06.496Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:07.467Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. 
Return only the numbers separated by commas, no explanation.", + "result": [ + "0,1,1,2,3" + ], + "expected": "0,1,1,2,3", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:08.798Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.", + "result": [ + "0,1,1,2,3\n" + ], + "expected": "0,1,1,2,3", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:09.986Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.", + "result": [ + "0, 1, 1, 2, 3" + ], + "expected": "0,1,1,2,3", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:11.342Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.", + "result": [ + "[-2, -3]" + ], + "expected": "[-3,-2]", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:12.578Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.", + "result": [ + "[-2, -3]" + ], + "expected": "[-3,-2]", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:13.716Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. 
Return only the roots as a JSON array, no explanation.", + "result": [ + "[-2, -3]" + ], + "expected": "[-3,-2]", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:46:14.857Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:47.605Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120\n" + ], + "expected": "120", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:49.748Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:50.767Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.", + "result": [ + "0,1,1,2,3" + ], + "expected": "0,1,1,2,3", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:52.399Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.", + "result": [ + "0,1,1,2,3\n" + ], + "expected": "0,1,1,2,3", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:53.994Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. 
Return only the numbers separated by commas, no explanation.", + "result": [ + "0, 1, 1, 2, 3" + ], + "expected": "0,1,1,2,3", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:55.463Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.", + "result": [ + "[-2, -3]" + ], + "expected": "[-3,-2]", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:56.689Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.", + "result": [ + "[-2, -3]" + ], + "expected": "[-3,-2]", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:53:59.331Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:01.640Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120\n" + ], + "expected": "120", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:03.712Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:04.767Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. 
Return only the numbers separated by commas, no explanation.", + "result": [ + "0,1,1,2,3" + ], + "expected": "0,1,1,2,3", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:07.138Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.", + "result": [ + "0,1,1,2,3\n" + ], + "expected": "0,1,1,2,3", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:08.328Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.", + "result": [ + "0, 1, 1, 2, 3" + ], + "expected": "0,1,1,2,3", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:09.776Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.", + "result": [ + "[-2, -3]" + ], + "expected": "[-3,-2]", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:10.999Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.", + "result": [ + "[-2, -3]" + ], + "expected": "[-3,-2]", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:14.081Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:32.818Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. 
Return only the numbers separated by commas, no explanation.", + "result": [ + "0, 1, 1, 2, 3" + ], + "expected": "0,1,1,2,3", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:35.563Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.", + "result": [ + "[-2,-3]" + ], + "expected": "[-3,-2]", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:56:37.672Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:09.054Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120\n" + ], + "expected": "120", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:10.238Z", + "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:11.167Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.", + "result": [ + "0,1,1,2,3" + ], + "expected": "0,1,1,2,3", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:13.500Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. 
Return only the numbers separated by commas, no explanation.", + "result": [ + "0,1,1,2,3\n" + ], + "expected": "0,1,1,2,3", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:14.679Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.", + "result": [ + "0, 1, 1, 2, 3" + ], + "expected": "0,1,1,2,3", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:16.094Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.", + "result": [ + "[-2, -3]" + ], + "expected": "[-3,-2]", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:19.103Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. 
Return only the roots as a JSON array, no explanation.", + "result": [ + "[\"-2\", \"-3\"]" + ], + "expected": "[-3,-2]", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T10:59:20.963Z", + "passed": true } ] \ No newline at end of file diff --git a/packages/kbot/tests/unit/math.test.ts b/packages/kbot/tests/unit/math.test.ts index e671c811..cbbd585b 100644 --- a/packages/kbot/tests/unit/math.test.ts +++ b/packages/kbot/tests/unit/math.test.ts @@ -4,32 +4,12 @@ import * as path from 'node:path' import { sync as write } from "@polymech/fs/write" import { sync as read } from "@polymech/fs/read" import { sync as exists } from "@polymech/fs/exists" +import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons' import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index' -const models = [ - E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE, - E_OPENROUTER_MODEL_FREE.MODEL_FREE_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE, - E_OPENAI_MODEL.MODEL_GPT_4 -] - -const TEST_BASE_PATH = path.resolve(__dirname, '../../') -const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs') -const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md') const TEST_LOG_PATH = path.resolve(__dirname, './math.json') -interface TestResult { - test: string; - prompt: string; - result: string[]; - expected: string; - model: string; - router: string; - timestamp: string; - passed: boolean; - reason?: string; -} - describe('Advanced Math Operations', () => { let testResults: TestResult[] = [] @@ -57,9 +37,9 @@ describe('Advanced Math Operations', () => { router = options.router || 'unknown' return options } - }) as string[] - - const actual = result.map(r => r.trim())[0] + }) as string[] + + const actual = result?.[0]?.trim() || '' const passed = actual === expected expect(actual).toEqual(expected) @@ -67,13 +47,13 @@ describe('Advanced Math Operations', () => { testResults.push({ test: 'factorial', 
prompt, - result, + result: result || [], expected, model, router, timestamp: new Date().toISOString(), passed, - reason: passed ? undefined : `Expected ${expected}, but got ${actual}` + reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` }) // Write all results to the same file @@ -101,7 +81,7 @@ describe('Advanced Math Operations', () => { }) as string[] // Handle both formats: "0,1,1,2,3" and "0, 1, 1, 2, 3" - const numbers = result[0].trim().split(',').map(n => n.trim()) + const numbers = result?.[0]?.trim()?.split(',')?.map(n => n.trim()) || [] const actual = numbers.join(',') const passed = actual === expected expect(numbers).toEqual(['0', '1', '1', '2', '3']) @@ -110,13 +90,13 @@ describe('Advanced Math Operations', () => { testResults.push({ test: 'fibonacci', prompt, - result, + result: result || [], expected, model, router, timestamp: new Date().toISOString(), passed, - reason: passed ? undefined : `Expected ${expected}, but got ${actual}` + reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` }) // Write all results to the same file @@ -145,7 +125,24 @@ describe('Advanced Math Operations', () => { }) as string[] // Parse the result as JSON (markdown already stripped by filter) - const jsonResult = JSON.parse(result[0].trim()) + let jsonResult: number[] + try { + const resultStr = result?.[0]?.trim() || '[]' + if (!resultStr) { + throw new Error('No result returned from model') + } + jsonResult = JSON.parse(resultStr) + if (!Array.isArray(jsonResult)) { + throw new Error('Result is not an array') + } + // Convert any string numbers to actual numbers + jsonResult = jsonResult.map(n => typeof n === 'string' ? 
parseFloat(n) : n) + } catch (error) { + // If parsing fails, try to extract numbers from the string + const numbers = result?.[0]?.match(/-?\d+/g)?.map(n => parseInt(n, 10)) || [] + jsonResult = numbers + } + const actual = JSON.stringify(jsonResult.sort()) const expectedSorted = JSON.stringify([-3, -2].sort()) const passed = actual === expectedSorted @@ -155,16 +152,72 @@ describe('Advanced Math Operations', () => { testResults.push({ test: 'quadratic', prompt, - result, + result: result || [], expected: expectedDisplay, model, router, timestamp: new Date().toISOString(), passed, - reason: passed ? undefined : `Expected ${expectedDisplay}, but got ${result[0]}` + reason: !result?.[0] ? 'No result returned from model' : passed ? undefined : `Expected ${expectedDisplay}, but got ${result?.[0] || ''}` }) // Write all results to the same file write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) }) + + it('should generate markdown report', () => { + // Group results by test and model + const latestResults = new Map>() + + // Get only the latest result for each test+model combination + testResults.forEach(result => { + if (!latestResults.has(result.test)) { + latestResults.set(result.test, new Map()) + } + const testMap = latestResults.get(result.test)! 
+ const existingResult = testMap.get(result.model) + if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) { + testMap.set(result.model, result) + } + }) + + // Generate markdown report + let report = '# Math Test Results\n\n' + + // First list failed tests + report += '## Failed Tests\n\n' + for (const [testName, modelResults] of latestResults) { + for (const [model, result] of modelResults) { + if (!result.passed) { + report += `### ${testName} - ${model}\n` + report += `- Prompt: \`${result.prompt}\`\n` + report += `- Expected: \`${result.expected}\`\n` + report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Reason: ${result.reason}\n` + report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` + } + } + } + + // Then list passed tests + report += '## Passed Tests\n\n' + for (const [testName, modelResults] of latestResults) { + for (const [model, result] of modelResults) { + if (result.passed) { + report += `### ${testName} - ${model}\n` + report += `- Prompt: \`${result.prompt}\`\n` + report += `- Expected: \`${result.expected}\`\n` + report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` + } + } + } + + // Write report to file + const reportPath = path.resolve(__dirname, './math-report.md') + write(reportPath, report) + + // Verify report was written + expect(exists(reportPath) === 'file').toBe(true) + }) }) \ No newline at end of file