From 75af5d1a26813f2f017c62da56bce1a7aedcb164 Mon Sep 17 00:00:00 2001 From: babayaga Date: Tue, 1 Apr 2025 13:10:17 +0200 Subject: [PATCH] tests:language - merci, habsch selbstgemacht :) --- packages/kbot/logs/params.json | 2 +- packages/kbot/tests/unit/basic-report.md | 6 +- packages/kbot/tests/unit/basic.json | 36 +++ packages/kbot/tests/unit/basic.test.ts | 240 +++++++++--------- packages/kbot/tests/unit/commons.ts | 64 +++++ packages/kbot/tests/unit/language-report.md | 149 +++++++---- packages/kbot/tests/unit/language.json | 174 +++++++++++++ packages/kbot/tests/unit/language.test.ts | 232 ++++++++--------- packages/kbot/tests/unit/math-report.md | 8 +- packages/kbot/tests/unit/math.json | 36 +++ packages/kbot/tests/unit/math.test.ts | 261 ++++++++++---------- 11 files changed, 790 insertions(+), 418 deletions(-) diff --git a/packages/kbot/logs/params.json b/packages/kbot/logs/params.json index d897f083..66bb0162 100644 --- a/packages/kbot/logs/params.json +++ b/packages/kbot/logs/params.json @@ -3,7 +3,7 @@ "messages": [ { "role": "user", - "content": "divide 15 by 3. Return only the number, no explanation." + "content": "translate \"no\" to French. Return only the translated word, no explanation." }, { "role": "user", diff --git a/packages/kbot/tests/unit/basic-report.md b/packages/kbot/tests/unit/basic-report.md index 5d0b3391..ffa234b0 100644 --- a/packages/kbot/tests/unit/basic-report.md +++ b/packages/kbot/tests/unit/basic-report.md @@ -67,7 +67,7 @@ - Prompt: `add 5 and 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Timestamp: 4/1/2025, 1:00:35 PM +- Timestamp: 4/1/2025, 1:02:55 PM ### multiplication - deepseek/deepseek-chat:free - Prompt: `multiply 8 and 3. Return only the number, no explanation.` @@ -86,7 +86,7 @@ - Prompt: `multiply 8 and 3. Return only the number, no explanation.` - Expected: `24` - Actual: `24` -- Timestamp: 4/1/2025, 1:00:37 PM +- Timestamp: 4/1/2025, 1:02:57 PM ### division - deepseek/deepseek-chat:free - Prompt: `divide 15 by 3. Return only the number, no explanation.` @@ -105,5 +105,5 @@ - Prompt: `divide 15 by 3. Return only the number, no explanation.` - Expected: `5` - Actual: `5` -- Timestamp: 4/1/2025, 1:00:40 PM +- Timestamp: 4/1/2025, 1:03:00 PM diff --git a/packages/kbot/tests/unit/basic.json b/packages/kbot/tests/unit/basic.json index 76593563..abca2685 100644 --- a/packages/kbot/tests/unit/basic.json +++ b/packages/kbot/tests/unit/basic.json @@ -780,5 +780,41 @@ "router": "openrouter", "timestamp": "2025-04-01T11:00:40.556Z", "passed": true + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:02:55.210Z", + "passed": true + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:02:57.579Z", + "passed": true + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:03:00.064Z", + "passed": true } ] \ No newline at end of file diff --git a/packages/kbot/tests/unit/basic.test.ts b/packages/kbot/tests/unit/basic.test.ts index 000669e2..c051df61 100644 --- a/packages/kbot/tests/unit/basic.test.ts +++ b/packages/kbot/tests/unit/basic.test.ts @@ -4,11 +4,22 @@ import * as path from 'node:path' import { sync as write } from "@polymech/fs/write" import { sync as read } from "@polymech/fs/read" import { sync as exists } from "@polymech/fs/exists" -import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons' +import { + models, + TEST_BASE_PATH, + TEST_LOGS_PATH, + TEST_PREFERENCES_PATH, + TEST_TIMEOUT, + TestResult, + formatError, + isEmptyResponse, + getRouterForModel, + getApiKeyForRouter +} from './commons' const TEST_LOG_PATH = path.resolve(__dirname, './basic.json') -describe('Basic Operations', () => { +describe('Basic Capabilities', () => { let testResults: TestResult[] = [] // Load existing results if any @@ -17,127 +28,109 @@ describe('Basic Operations', () => { testResults = Array.isArray(data) ? data : [] } - it.each(models)('should add two numbers with model %s', async (modelName) => { - const prompt = 'add 5 and 3. Return only the number, no explanation.' - const expected = '8' + const runBasicTest = async (prompt: string, expected: string, testName: string, modelName: string) => { let model = 'unknown' let router = 'unknown' + let startTime = Date.now() + let error: TestResult['error'] | undefined - const result = await run({ - prompt, - mode: 'completion', - model: modelName, - path: TEST_BASE_PATH, - logs: TEST_LOGS_PATH, - preferences: TEST_PREFERENCES_PATH, - onRun: async (options) => { - model = options.model || 'unknown' - router = options.router || 'unknown' - return options + try { + const result = await Promise.race([ + run({ + prompt, + mode: 'completion', + model: modelName, + path: TEST_BASE_PATH, + logs: TEST_LOGS_PATH, + preferences: TEST_PREFERENCES_PATH, + onRun: async (options) => { + model = options.model || 'unknown' + router = options.router || 'unknown' + return options + } + }), + new Promise((_, reject) => + setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT) + ) + ]) as string[] + + const actual = result?.[0]?.trim()?.toLowerCase() || '' + const passed = actual === expected && !isEmptyResponse(result) + + if (isEmptyResponse(result)) { + throw new Error('Model returned empty response') } - }) as string[] + + expect(actual).toEqual(expected) - const actual = result?.[0]?.trim() || '' - const passed = actual === expected - expect(actual).toEqual(expected) + return { + test: testName, + prompt, + result: result || [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed, + duration: Date.now() - startTime, + reason: passed ? undefined : `Expected ${expected}, but got ${actual}`, + config: { + router: getRouterForModel(modelName), + apiKey: getApiKeyForRouter(getRouterForModel(modelName)) + } + } + } catch (e) { + error = formatError(e) + throw e + } finally { + const testResult: TestResult = { + test: testName, + prompt, + result: [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed: false, + duration: Date.now() - startTime, + error, + reason: error?.message || 'Unknown error occurred', + config: { + router: getRouterForModel(modelName), + apiKey: getApiKeyForRouter(getRouterForModel(modelName)) + } + } + + testResults.push(testResult) + write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + } + } - // Add test result to array - testResults.push({ - test: 'addition', - prompt, - result: result || [], - expected, - model, - router, - timestamp: new Date().toISOString(), - passed, - reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` - }) - - // Write all results to the same file - write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + it.each(models)('should respond to "hello" with model %s', async (modelName) => { + await runBasicTest( + 'say "hello"', + 'hello', + 'hello', + modelName + ) }) - it.each(models)('should multiply two numbers with model %s', async (modelName) => { - const prompt = 'multiply 8 and 3. Return only the number, no explanation.' - const expected = '24' - let model = 'unknown' - let router = 'unknown' - - const result = await run({ - prompt, - mode: 'completion', - model: modelName, - path: TEST_BASE_PATH, - logs: TEST_LOGS_PATH, - preferences: TEST_PREFERENCES_PATH, - onRun: async (options) => { - model = options.model || 'unknown' - router = options.router || 'unknown' - return options - } - }) as string[] - - const actual = result?.[0]?.trim() || '' - const passed = actual === expected - expect(actual).toEqual(expected) - - // Add test result to array - testResults.push({ - test: 'multiplication', - prompt, - result: result || [], - expected, - model, - router, - timestamp: new Date().toISOString(), - passed, - reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` - }) - - // Write all results to the same file - write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + it.each(models)('should respond to "goodbye" with model %s', async (modelName) => { + await runBasicTest( + 'say "goodbye"', + 'goodbye', + 'goodbye', + modelName + ) }) - it.each(models)('should divide two numbers with model %s', async (modelName) => { - const prompt = 'divide 15 by 3. Return only the number, no explanation.' - const expected = '5' - let model = 'unknown' - let router = 'unknown' - - const result = await run({ - prompt, - mode: 'completion', - model: modelName, - path: TEST_BASE_PATH, - logs: TEST_LOGS_PATH, - preferences: TEST_PREFERENCES_PATH, - onRun: async (options) => { - model = options.model || 'unknown' - router = options.router || 'unknown' - return options - } - }) as string[] - - const actual = result?.[0]?.trim() || '' - const passed = actual === expected - expect(actual).toEqual(expected) - - // Add test result to array - testResults.push({ - test: 'division', - prompt, - result: result || [], - expected, - model, - router, - timestamp: new Date().toISOString(), - passed, - reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` - }) - - // Write all results to the same file - write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + it.each(models)('should respond to "yes" with model %s', async (modelName) => { + await runBasicTest( + 'say "yes"', + 'yes', + 'yes', + modelName + ) }) it('should generate markdown report', () => { @@ -157,36 +150,55 @@ describe('Basic Operations', () => { }) // Generate markdown report - let report = '# Basic Operations Test Results\n\n' + let report = '# Basic Test Results\n\n' // First list failed tests report += '## Failed Tests\n\n' + let hasFailures = false for (const [testName, modelResults] of latestResults) { for (const [model, result] of modelResults) { if (!result.passed) { + hasFailures = true report += `### ${testName} - ${model}\n` report += `- Prompt: \`${result.prompt}\`\n` report += `- Expected: \`${result.expected}\`\n` report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Duration: ${result.duration}ms\n` + if (result.error) { + report += `- Error Type: ${result.error.type}\n` + report += `- Error Code: ${result.error.code}\n` + report += `- Error Message: ${result.error.message}\n` + } report += `- Reason: ${result.reason}\n` report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` } } } + + if (!hasFailures) { + report += '*No failed tests*\n\n' + } // Then list passed tests report += '## Passed Tests\n\n' + let hasPassed = false for (const [testName, modelResults] of latestResults) { for (const [model, result] of modelResults) { if (result.passed) { + hasPassed = true report += `### ${testName} - ${model}\n` report += `- Prompt: \`${result.prompt}\`\n` report += `- Expected: \`${result.expected}\`\n` report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Duration: ${result.duration}ms\n` report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` } } } + + if (!hasPassed) { + report += '*No passed tests*\n\n' + } // Write report to file const reportPath = path.resolve(__dirname, './basic-report.md') diff --git a/packages/kbot/tests/unit/commons.ts b/packages/kbot/tests/unit/commons.ts index 9a3d5f50..c3289c12 100644 --- a/packages/kbot/tests/unit/commons.ts +++ b/packages/kbot/tests/unit/commons.ts @@ -1,6 +1,18 @@ import * as path from 'node:path' import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index' +// Test configuration +export const TEST_CONFIG = { + openrouter: { + key: process.env.OPENROUTER_API_KEY || '', + org: process.env.OPENROUTER_ORG_ID || '' + }, + openai: { + key: process.env.OPENAI_API_KEY || '', + org: process.env.OPENAI_ORG_ID || '' + } +} + export const models = [ E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE, E_OPENROUTER_MODEL_FREE.MODEL_FREE_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE, @@ -10,6 +22,7 @@ export const models = [ export const TEST_BASE_PATH = path.resolve(__dirname, '../../') export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs') export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md') +export const TEST_TIMEOUT = 30000 // 30 seconds timeout for API calls export interface TestResult { test: string; @@ -21,4 +34,55 @@ export interface TestResult { timestamp: string; passed: boolean; reason?: string; + error?: { + message: string; + code?: string; + type?: string; + details?: any; + }; + duration?: number; + config?: { + apiKey?: string; + baseURL?: string; + router?: string; + }; +} + +export const formatError = (error: any): TestResult['error'] => { + return { + message: error?.message || 'Unknown error', + code: error?.code || 'UNKNOWN', + type: error?.type || error?.constructor?.name || 'Error', + details: error?.response?.data || error?.response || error + } +} + +export const isEmptyResponse = (result: string[] | null | undefined): boolean => { + return !result || result.length === 0 || result.every(r => !r || r.trim() === '') +} + +export const getRouterForModel = (model: string): string => { + if (model.startsWith('gpt-')) return 'openai' + return 'openrouter' +} + +export const getApiKeyForRouter = (router: string): string => { + switch (router) { + case 'openai': + return TEST_CONFIG.openai.key + case 'openrouter': + return TEST_CONFIG.openrouter.key + default: + return '' + } +} + +export const validateConfig = () => { + const missingKeys: string[] = [] + if (!TEST_CONFIG.openrouter.key) missingKeys.push('OPENROUTER_API_KEY') + if (!TEST_CONFIG.openai.key) missingKeys.push('OPENAI_API_KEY') + + if (missingKeys.length > 0) { + throw new Error(`Missing required environment variables: ${missingKeys.join(', ')}`) + } } \ No newline at end of file diff --git a/packages/kbot/tests/unit/language-report.md b/packages/kbot/tests/unit/language-report.md index 8d778ee4..61484aa9 100644 --- a/packages/kbot/tests/unit/language-report.md +++ b/packages/kbot/tests/unit/language-report.md @@ -2,12 +2,103 @@ ## Failed Tests +### german - deepseek/deepseek-chat:free +- Prompt: `translate "hello" to German. Return only the translated word, no explanation.` +- Expected: `hallo` +- Actual: `` +- Duration: 985ms +- Error Type: Error +- Error Code: UNKNOWN +- Error Message: Model returned empty response +- Reason: Model returned empty response +- Timestamp: 4/1/2025, 1:05:50 PM + +### german - google/gemini-2.0-flash-exp:free +- Prompt: `translate "hello" to German. Return only the translated word, no explanation.` +- Expected: `hallo` +- Actual: `` +- Duration: 746ms +- Error Type: Error +- Error Code: UNKNOWN +- Error Message: Model returned empty response +- Reason: Model returned empty response +- Timestamp: 4/1/2025, 1:05:51 PM + +### german - gpt-4 +- Prompt: `translate "hello" to German. Return only the translated word, no explanation.` +- Expected: `hallo` +- Actual: `` +- Duration: 1067ms +- Reason: Unknown error occurred +- Timestamp: 4/1/2025, 1:05:52 PM + +### spanish - deepseek/deepseek-chat:free +- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` +- Expected: `sí` +- Actual: `` +- Duration: 678ms +- Error Type: Error +- Error Code: UNKNOWN +- Error Message: Model returned empty response +- Reason: Model returned empty response +- Timestamp: 4/1/2025, 1:05:53 PM + +### spanish - google/gemini-2.0-flash-exp:free +- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` +- Expected: `sí` +- Actual: `` +- Duration: 744ms +- Error Type: Error +- Error Code: UNKNOWN +- Error Message: Model returned empty response +- Reason: Model returned empty response +- Timestamp: 4/1/2025, 1:05:53 PM + +### spanish - gpt-4 +- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` +- Expected: `sí` +- Actual: `` +- Duration: 1125ms +- Reason: Unknown error occurred +- Timestamp: 4/1/2025, 1:05:55 PM + +### french - deepseek/deepseek-chat:free +- Prompt: `translate "no" to French. Return only the translated word, no explanation.` +- Expected: `non` +- Actual: `` +- Duration: 626ms +- Error Type: Error +- Error Code: UNKNOWN +- Error Message: Model returned empty response +- Reason: Model returned empty response +- Timestamp: 4/1/2025, 1:05:55 PM + +### french - gpt-4 +- Prompt: `translate "no" to French. Return only the translated word, no explanation.` +- Expected: `non` +- Actual: `` +- Duration: 1341ms +- Reason: Unknown error occurred +- Timestamp: 4/1/2025, 1:05:57 PM + +### french - google/gemini-2.0-flash-exp:free +- Prompt: `translate "no" to French. Return only the translated word, no explanation.` +- Expected: `non` +- Actual: `` +- Duration: 729ms +- Error Type: Error +- Error Code: UNKNOWN +- Error Message: Model returned empty response +- Reason: Model returned empty response +- Timestamp: 4/1/2025, 1:05:56 PM + ## Passed Tests ### german_translation - deepseek/deepseek-chat:free - Prompt: `translate "hello" to German. Return only the translation, no explanation.` - Expected: `hallo` - Actual: `Hallo` +- Duration: undefinedms - Timestamp: 4/1/2025, 12:56:01 PM ### german_translation - google/gemini-2.0-flash-exp:free @@ -15,18 +106,21 @@ - Expected: `hallo` - Actual: `Hallo ` +- Duration: undefinedms - Timestamp: 4/1/2025, 12:56:02 PM ### german_translation - gpt-4 - Prompt: `translate "hello" to German. Return only the translation, no explanation.` - Expected: `hallo` - Actual: `Hallo` +- Duration: undefinedms - Timestamp: 4/1/2025, 12:56:32 PM ### spanish_translation - deepseek/deepseek-chat:free - Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.` - Expected: `sí` - Actual: `sí` +- Duration: undefinedms - Timestamp: 4/1/2025, 12:56:05 PM ### spanish_translation - google/gemini-2.0-flash-exp:free @@ -34,18 +128,21 @@ - Expected: `sí` - Actual: `sí ` +- Duration: undefinedms - Timestamp: 4/1/2025, 12:56:06 PM ### spanish_translation - gpt-4 - Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.` - Expected: `sí` - Actual: `sí` +- Duration: undefinedms - Timestamp: 4/1/2025, 12:56:35 PM ### french_translation - deepseek/deepseek-chat:free - Prompt: `translate "no" to French. Return only the translation, no explanation.` - Expected: `non` - Actual: `non` +- Duration: undefinedms - Timestamp: 4/1/2025, 12:56:08 PM ### french_translation - google/gemini-2.0-flash-exp:free @@ -53,61 +150,13 @@ - Expected: `non` - Actual: `non ` +- Duration: undefinedms - Timestamp: 4/1/2025, 12:56:10 PM ### french_translation - gpt-4 - Prompt: `translate "no" to French. Return only the translation, no explanation.` - Expected: `non` - Actual: `non` +- Duration: undefinedms - Timestamp: 4/1/2025, 12:56:37 PM -### german - deepseek/deepseek-chat:free -- Prompt: `translate "hello" to German. Return only the translated word, no explanation.` -- Expected: `hallo` -- Actual: `Hallo` -- Timestamp: 4/1/2025, 12:59:07 PM - -### german - google/gemini-2.0-flash-exp:free -- Prompt: `translate "hello" to German. Return only the translated word, no explanation.` -- Expected: `hallo` -- Actual: `Hallo -` -- Timestamp: 4/1/2025, 12:59:09 PM - -### german - gpt-4 -- Prompt: `translate "hello" to German. Return only the translated word, no explanation.` -- Expected: `hallo` -- Actual: `Hallo` -- Timestamp: 4/1/2025, 12:59:11 PM - -### spanish - deepseek/deepseek-chat:free -- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` -- Expected: `sí` -- Actual: `Sí` -- Timestamp: 4/1/2025, 12:59:12 PM - -### spanish - google/gemini-2.0-flash-exp:free -- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` -- Expected: `sí` -- Actual: `Sí -` -- Timestamp: 4/1/2025, 12:59:14 PM - -### spanish - gpt-4 -- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.` -- Expected: `sí` -- Actual: `sí` -- Timestamp: 4/1/2025, 12:59:15 PM - -### french - deepseek/deepseek-chat:free -- Prompt: `translate "no" to French. Return only the translated word, no explanation.` -- Expected: `non` -- Actual: `non` -- Timestamp: 4/1/2025, 12:59:18 PM - -### french - gpt-4 -- Prompt: `translate "no" to French. Return only the translated word, no explanation.` -- Expected: `non` -- Actual: `non` -- Timestamp: 4/1/2025, 12:59:20 PM - diff --git a/packages/kbot/tests/unit/language.json b/packages/kbot/tests/unit/language.json index 68177172..9fc6d16e 100644 --- a/packages/kbot/tests/unit/language.json +++ b/packages/kbot/tests/unit/language.json @@ -526,5 +526,179 @@ "router": "openrouter", "timestamp": "2025-04-01T10:59:20.589Z", "passed": true + }, + { + "test": "german", + "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.", + "result": [ + "Hallo" + ], + "expected": "hallo", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:03:09.890Z", + "passed": true + }, + { + "test": "spanish", + "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.", + "result": [ + "sí" + ], + "expected": "sí", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:03:12.312Z", + "passed": true + }, + { + "test": "french", + "prompt": "translate \"no\" to French. Return only the translated word, no explanation.", + "result": [ + "non" + ], + "expected": "non", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:03:14.660Z", + "passed": true + }, + { + "test": "german", + "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.", + "result": [], + "expected": "hallo", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T11:05:50.723Z", + "passed": false, + "duration": 985, + "error": { + "message": "Model returned empty response", + "code": "UNKNOWN", + "type": "Error" + }, + "reason": "Model returned empty response" + }, + { + "test": "german", + "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.", + "result": [], + "expected": "hallo", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T11:05:51.471Z", + "passed": false, + "duration": 746, + "error": { + "message": "Model returned empty response", + "code": "UNKNOWN", + "type": "Error" + }, + "reason": "Model returned empty response" + }, + { + "test": "german", + "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.", + "result": [], + "expected": "hallo", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:05:52.540Z", + "passed": false, + "duration": 1067, + "reason": "Unknown error occurred" + }, + { + "test": "spanish", + "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.", + "result": [], + "expected": "sí", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T11:05:53.219Z", + "passed": false, + "duration": 678, + "error": { + "message": "Model returned empty response", + "code": "UNKNOWN", + "type": "Error" + }, + "reason": "Model returned empty response" + }, + { + "test": "spanish", + "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.", + "result": [], + "expected": "sí", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T11:05:53.964Z", + "passed": false, + "duration": 744, + "error": { + "message": "Model returned empty response", + "code": "UNKNOWN", + "type": "Error" + }, + "reason": "Model returned empty response" + }, + { + "test": "spanish", + "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.", + "result": [], + "expected": "sí", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:05:55.090Z", + "passed": false, + "duration": 1125, + "reason": "Unknown error occurred" + }, + { + "test": "french", + "prompt": "translate \"no\" to French. Return only the translated word, no explanation.", + "result": [], + "expected": "non", + "model": "deepseek/deepseek-chat:free", + "router": "openrouter", + "timestamp": "2025-04-01T11:05:55.717Z", + "passed": false, + "duration": 626, + "error": { + "message": "Model returned empty response", + "code": "UNKNOWN", + "type": "Error" + }, + "reason": "Model returned empty response" + }, + { + "test": "french", + "prompt": "translate \"no\" to French. Return only the translated word, no explanation.", + "result": [], + "expected": "non", + "model": "google/gemini-2.0-flash-exp:free", + "router": "openrouter", + "timestamp": "2025-04-01T11:05:56.447Z", + "passed": false, + "duration": 729, + "error": { + "message": "Model returned empty response", + "code": "UNKNOWN", + "type": "Error" + }, + "reason": "Model returned empty response" + }, + { + "test": "french", + "prompt": "translate \"no\" to French. Return only the translated word, no explanation.", + "result": [], + "expected": "non", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:05:57.790Z", + "passed": false, + "duration": 1341, + "reason": "Unknown error occurred" } ] \ No newline at end of file diff --git a/packages/kbot/tests/unit/language.test.ts b/packages/kbot/tests/unit/language.test.ts index 4eed7007..919273ba 100644 --- a/packages/kbot/tests/unit/language.test.ts +++ b/packages/kbot/tests/unit/language.test.ts @@ -4,7 +4,18 @@ import * as path from 'node:path' import { sync as write } from "@polymech/fs/write" import { sync as read } from "@polymech/fs/read" import { sync as exists } from "@polymech/fs/exists" -import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons' +import { + models, + TEST_BASE_PATH, + TEST_LOGS_PATH, + TEST_PREFERENCES_PATH, + TEST_TIMEOUT, + TestResult, + formatError, + isEmptyResponse, + getRouterForModel, + getApiKeyForRouter +} from './commons' import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index' @@ -19,127 +30,109 @@ describe('Language Capabilities', () => { testResults = Array.isArray(data) ? data : [] } - it.each(models)('should translate "hello" to German with model %s', async (modelName) => { - const prompt = 'translate "hello" to German. Return only the translated word, no explanation.' - const expected = 'hallo' + const runTranslationTest = async (prompt: string, expected: string, testName: string, modelName: string) => { let model = 'unknown' let router = 'unknown' + let startTime = Date.now() + let error: TestResult['error'] | undefined - const result = await run({ - prompt, - mode: 'completion', - model: modelName, - path: TEST_BASE_PATH, - logs: TEST_LOGS_PATH, - preferences: TEST_PREFERENCES_PATH, - onRun: async (options) => { - model = options.model || 'unknown' - router = options.router || 'unknown' - return options + try { + const result = await Promise.race([ + run({ + prompt, + mode: 'completion', + model: modelName, + path: TEST_BASE_PATH, + logs: TEST_LOGS_PATH, + preferences: TEST_PREFERENCES_PATH, + onRun: async (options) => { + model = options.model || 'unknown' + router = options.router || 'unknown' + return options + } + }), + new Promise((_, reject) => + setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT) + ) + ]) as string[] + + const actual = result?.[0]?.trim()?.toLowerCase() || '' + const passed = actual === expected && !isEmptyResponse(result) + + if (isEmptyResponse(result)) { + throw new Error('Model returned empty response') } - }) as string[] + + expect(actual).toEqual(expected) - const actual = result?.[0]?.trim()?.toLowerCase() || '' - const passed = actual === expected - expect(actual).toEqual(expected) + return { + test: testName, + prompt, + result: result || [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed, + duration: Date.now() - startTime, + reason: passed ? undefined : `Expected ${expected}, but got ${actual}`, + config: { + router: getRouterForModel(modelName), + apiKey: getApiKeyForRouter(getRouterForModel(modelName)) + } + } + } catch (e) { + error = formatError(e) + throw e + } finally { + const testResult: TestResult = { + test: testName, + prompt, + result: [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed: false, + duration: Date.now() - startTime, + error, + reason: error?.message || 'Unknown error occurred', + config: { + router: getRouterForModel(modelName), + apiKey: getApiKeyForRouter(getRouterForModel(modelName)) + } + } + + testResults.push(testResult) + write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + } + } - // Add test result to array - testResults.push({ - test: 'german', - prompt, - result: result || [], - expected, - model, - router, - timestamp: new Date().toISOString(), - passed, - reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` - }) - - // Write all results to the same file - write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + it.each(models)('should translate "hello" to German with model %s', async (modelName) => { + await runTranslationTest( + 'translate "hello" to German. Return only the translated word, no explanation.', + 'hallo', + 'german', + modelName + ) }) it.each(models)('should translate "yes" to Spanish with model %s', async (modelName) => { - const prompt = 'translate "yes" to Spanish. Return only the translated word, no explanation.' - const expected = 'sí' - let model = 'unknown' - let router = 'unknown' - - const result = await run({ - prompt, - mode: 'completion', - model: modelName, - path: TEST_BASE_PATH, - logs: TEST_LOGS_PATH, - preferences: TEST_PREFERENCES_PATH, - onRun: async (options) => { - model = options.model || 'unknown' - router = options.router || 'unknown' - return options - } - }) as string[] - - const actual = result?.[0]?.trim()?.toLowerCase() || '' - const passed = actual === expected - expect(actual).toEqual(expected) - - // Add test result to array - testResults.push({ - test: 'spanish', - prompt, - result: result || [], - expected, - model, - router, - timestamp: new Date().toISOString(), - passed, - reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` - }) - - // Write all results to the same file - write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + await runTranslationTest( + 'translate "yes" to Spanish. Return only the translated word, no explanation.', + 'sí', + 'spanish', + modelName + ) }) it.each(models)('should translate "no" to French with model %s', async (modelName) => { - const prompt = 'translate "no" to French. Return only the translated word, no explanation.' - const expected = 'non' - let model = 'unknown' - let router = 'unknown' - - const result = await run({ - prompt, - mode: 'completion', - model: modelName, - path: TEST_BASE_PATH, - logs: TEST_LOGS_PATH, - preferences: TEST_PREFERENCES_PATH, - onRun: async (options) => { - model = options.model || 'unknown' - router = options.router || 'unknown' - return options - } - }) as string[] - - const actual = result?.[0]?.trim()?.toLowerCase() || '' - const passed = actual === expected - expect(actual).toEqual(expected) - - // Add test result to array - testResults.push({ - test: 'french', - prompt, - result: result || [], - expected, - model, - router, - timestamp: new Date().toISOString(), - passed, - reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` - }) - - // Write all results to the same file - write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + await runTranslationTest( + 'translate "no" to French. Return only the translated word, no explanation.', + 'non', + 'french', + modelName + ) }) it('should generate markdown report', () => { @@ -163,32 +156,51 @@ describe('Language Capabilities', () => { // First list failed tests report += '## Failed Tests\n\n' + let hasFailures = false for (const [testName, modelResults] of latestResults) { for (const [model, result] of modelResults) { if (!result.passed) { + hasFailures = true report += `### ${testName} - ${model}\n` report += `- Prompt: \`${result.prompt}\`\n` report += `- Expected: \`${result.expected}\`\n` report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Duration: ${result.duration}ms\n` + if (result.error) { + report += `- Error Type: ${result.error.type}\n` + report += `- Error Code: ${result.error.code}\n` + report += `- Error Message: ${result.error.message}\n` + } report += `- Reason: ${result.reason}\n` report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` } } } + + if (!hasFailures) { + report += '*No failed tests*\n\n' + } // Then list passed tests report += '## Passed Tests\n\n' + let hasPassed = false for (const [testName, modelResults] of latestResults) { for (const [model, result] of modelResults) { if (result.passed) { + hasPassed = true report += `### ${testName} - ${model}\n` report += `- Prompt: \`${result.prompt}\`\n` report += `- Expected: \`${result.expected}\`\n` report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Duration: ${result.duration}ms\n` report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` } } } + + if (!hasPassed) { + report += '*No passed tests*\n\n' + } // Write report to file const reportPath = path.resolve(__dirname, './language-report.md') diff --git a/packages/kbot/tests/unit/math-report.md b/packages/kbot/tests/unit/math-report.md index 8590997b..df14d555 100644 --- a/packages/kbot/tests/unit/math-report.md +++ b/packages/kbot/tests/unit/math-report.md @@ -21,7 +21,7 @@ - Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.` - Expected: `120` - Actual: `120` -- Timestamp: 4/1/2025, 12:59:11 PM +- Timestamp: 4/1/2025, 1:03:25 PM ### fibonacci - deepseek/deepseek-chat:free - Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.` @@ -40,7 +40,7 @@ - Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.` - Expected: `0,1,1,2,3` - Actual: `0, 1, 1, 2, 3` -- Timestamp: 4/1/2025, 12:59:16 PM +- Timestamp: 4/1/2025, 1:03:27 PM ### quadratic - deepseek/deepseek-chat:free - Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.` @@ -57,6 +57,6 @@ ### quadratic - gpt-4 - Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.` - Expected: `[-3,-2]` -- Actual: `["-2", "-3"]` -- Timestamp: 4/1/2025, 12:59:20 PM +- Actual: `[-2, -3]` +- Timestamp: 4/1/2025, 1:03:30 PM diff --git a/packages/kbot/tests/unit/math.json b/packages/kbot/tests/unit/math.json index 9803f2e7..99c7071d 100644 --- a/packages/kbot/tests/unit/math.json +++ b/packages/kbot/tests/unit/math.json @@ -984,5 +984,41 @@ "router": "openrouter", "timestamp": "2025-04-01T10:59:20.963Z", "passed": true + }, + { + "test": "factorial", + "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:03:25.009Z", + "passed": true + }, + { + "test": "fibonacci", + "prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.", + "result": [ + "0, 1, 1, 2, 3" + ], + "expected": "0,1,1,2,3", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:03:27.826Z", + "passed": true + }, + { + "test": "quadratic", + "prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.", + "result": [ + "[-2, -3]" + ], + "expected": "[-3,-2]", + "model": "gpt-4", + "router": "openrouter", + "timestamp": "2025-04-01T11:03:30.579Z", + "passed": true } ] \ No newline at end of file diff --git a/packages/kbot/tests/unit/math.test.ts b/packages/kbot/tests/unit/math.test.ts index cbbd585b..60f6a5fe 100644 --- a/packages/kbot/tests/unit/math.test.ts +++ b/packages/kbot/tests/unit/math.test.ts @@ -4,13 +4,24 @@ import * as path from 'node:path' import { sync as write } from "@polymech/fs/write" import { sync as read } from "@polymech/fs/read" import { sync as exists } from "@polymech/fs/exists" -import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons' +import { + models, + TEST_BASE_PATH, + TEST_LOGS_PATH, + TEST_PREFERENCES_PATH, + TEST_TIMEOUT, + TestResult, + formatError, + isEmptyResponse, + getRouterForModel, + getApiKeyForRouter +} from './commons' import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index' const TEST_LOG_PATH = path.resolve(__dirname, './math.json') -describe('Advanced Math Operations', () => { +describe('Math Capabilities', () => { let testResults: TestResult[] = [] // Load existing results if any @@ -19,150 +30,109 @@ describe('Advanced Math Operations', () => { testResults = Array.isArray(data) ? data : [] } - it.each(models)('should calculate factorial of 5 with model %s', async (modelName) => { - const prompt = 'calculate the factorial of 5 (5!). Return only the number, no explanation.' - const expected = '120' + const runMathTest = async (prompt: string, expected: string, testName: string, modelName: string) => { let model = 'unknown' let router = 'unknown' + let startTime = Date.now() + let error: TestResult['error'] | undefined - const result = await run({ - prompt, - mode: 'completion', - model: modelName, - path: TEST_BASE_PATH, - logs: TEST_LOGS_PATH, - preferences: TEST_PREFERENCES_PATH, - onRun: async (options) => { - model = options.model || 'unknown' - router = options.router || 'unknown' - return options - } - }) as string[] - - const actual = result?.[0]?.trim() || '' - const passed = actual === expected - expect(actual).toEqual(expected) - - // Add test result to array - testResults.push({ - test: 'factorial', - prompt, - result: result || [], - expected, - model, - router, - timestamp: new Date().toISOString(), - passed, - reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` - }) - - // Write all results to the same file - write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) - }) - - it.each(models)('should calculate fibonacci sequence up to 5th number with model %s', async (modelName) => { - const prompt = 'calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.' - const expected = '0,1,1,2,3' - let model = 'unknown' - let router = 'unknown' - - const result = await run({ - prompt, - mode: 'completion', - model: modelName, - path: TEST_BASE_PATH, - logs: TEST_LOGS_PATH, - preferences: TEST_PREFERENCES_PATH, - onRun: async (options) => { - model = options.model || 'unknown' - router = options.router || 'unknown' - return options - } - }) as string[] - - // Handle both formats: "0,1,1,2,3" and "0, 1, 1, 2, 3" - const numbers = result?.[0]?.trim()?.split(',')?.map(n => n.trim()) || [] - const actual = numbers.join(',') - const passed = actual === expected - expect(numbers).toEqual(['0', '1', '1', '2', '3']) - - // Add test result to array - testResults.push({ - test: 'fibonacci', - prompt, - result: result || [], - expected, - model, - router, - timestamp: new Date().toISOString(), - passed, - reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}` - }) - - // Write all results to the same file - write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) - }) - - it.each(models)('should solve quadratic equation x² + 5x + 6 = 0 with model %s', async (modelName) => { - const prompt = 'solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.' - const expectedDisplay = '[-3,-2]' - let model = 'unknown' - let router = 'unknown' - - const result = await run({ - prompt, - mode: 'completion', - model: modelName, - filters: 'code', - path: TEST_BASE_PATH, - logs: TEST_LOGS_PATH, - preferences: TEST_PREFERENCES_PATH, - onRun: async (options) => { - model = options.model || 'unknown' - router = options.router || 'unknown' - return options - } - }) as string[] - - // Parse the result as JSON (markdown already stripped by filter) - let jsonResult: number[] try { - const resultStr = result?.[0]?.trim() || '[]' - if (!resultStr) { - throw new Error('No result returned from model') + const result = await Promise.race([ + run({ + prompt, + mode: 'completion', + model: modelName, + path: TEST_BASE_PATH, + logs: TEST_LOGS_PATH, + preferences: TEST_PREFERENCES_PATH, + onRun: async (options) => { + model = options.model || 'unknown' + router = options.router || 'unknown' + return options + } + }), + new Promise((_, reject) => + setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT) + ) + ]) as string[] + + const actual = result?.[0]?.trim()?.toLowerCase() || '' + const passed = actual === expected && !isEmptyResponse(result) + + if (isEmptyResponse(result)) { + throw new Error('Model returned empty response') } - jsonResult = JSON.parse(resultStr) - if (!Array.isArray(jsonResult)) { - throw new Error('Result is not an array') + + expect(actual).toEqual(expected) + + return { + test: testName, + prompt, + result: result || [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed, + duration: Date.now() - startTime, + reason: passed ? undefined : `Expected ${expected}, but got ${actual}`, + config: { + router: getRouterForModel(modelName), + apiKey: getApiKeyForRouter(getRouterForModel(modelName)) + } } - // Convert any string numbers to actual numbers - jsonResult = jsonResult.map(n => typeof n === 'string' ? parseFloat(n) : n) - } catch (error) { - // If parsing fails, try to extract numbers from the string - const numbers = result?.[0]?.match(/-?\d+/g)?.map(n => parseInt(n, 10)) || [] - jsonResult = numbers + } catch (e) { + error = formatError(e) + throw e + } finally { + const testResult: TestResult = { + test: testName, + prompt, + result: [], + expected, + model, + router, + timestamp: new Date().toISOString(), + passed: false, + duration: Date.now() - startTime, + error, + reason: error?.message || 'Unknown error occurred', + config: { + router: getRouterForModel(modelName), + apiKey: getApiKeyForRouter(getRouterForModel(modelName)) + } + } + + testResults.push(testResult) + write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) } + } - const actual = JSON.stringify(jsonResult.sort()) - const expectedSorted = JSON.stringify([-3, -2].sort()) - const passed = actual === expectedSorted - expect(jsonResult.sort()).toEqual([-3, -2].sort()) + it.each(models)('should add two numbers with model %s', async (modelName) => { + await runMathTest( + 'add 5 and 3. Return only the number, no explanation.', + '8', + 'addition', + modelName + ) + }) - // Add test result to array - testResults.push({ - test: 'quadratic', - prompt, - result: result || [], - expected: expectedDisplay, - model, - router, - timestamp: new Date().toISOString(), - passed, - reason: !result?.[0] ? 'No result returned from model' : passed ? undefined : `Expected ${expectedDisplay}, but got ${result?.[0] || ''}` - }) + it.each(models)('should multiply two numbers with model %s', async (modelName) => { + await runMathTest( + 'multiply 8 and 3. Return only the number, no explanation.', + '24', + 'multiplication', + modelName + ) + }) - // Write all results to the same file - write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2)) + it.each(models)('should divide two numbers with model %s', async (modelName) => { + await runMathTest( + 'divide 15 by 3. Return only the number, no explanation.', + '5', + 'division', + modelName + ) }) it('should generate markdown report', () => { @@ -186,32 +156,51 @@ describe('Advanced Math Operations', () => { // First list failed tests report += '## Failed Tests\n\n' + let hasFailures = false for (const [testName, modelResults] of latestResults) { for (const [model, result] of modelResults) { if (!result.passed) { + hasFailures = true report += `### ${testName} - ${model}\n` report += `- Prompt: \`${result.prompt}\`\n` report += `- Expected: \`${result.expected}\`\n` report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Duration: ${result.duration}ms\n` + if (result.error) { + report += `- Error Type: ${result.error.type}\n` + report += `- Error Code: ${result.error.code}\n` + report += `- Error Message: ${result.error.message}\n` + } report += `- Reason: ${result.reason}\n` report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` } } } + + if (!hasFailures) { + report += '*No failed tests*\n\n' + } // Then list passed tests report += '## Passed Tests\n\n' + let hasPassed = false for (const [testName, modelResults] of latestResults) { for (const [model, result] of modelResults) { if (result.passed) { + hasPassed = true report += `### ${testName} - ${model}\n` report += `- Prompt: \`${result.prompt}\`\n` report += `- Expected: \`${result.expected}\`\n` report += `- Actual: \`${result.result[0] || ''}\`\n` + report += `- Duration: ${result.duration}ms\n` report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n` } } } + + if (!hasPassed) { + report += '*No passed tests*\n\n' + } // Write report to file const reportPath = path.resolve(__dirname, './math-report.md')