From 75af5d1a26813f2f017c62da56bce1a7aedcb164 Mon Sep 17 00:00:00 2001
From: babayaga <cgoflyn@gmail.com>
Date: Tue, 1 Apr 2025 13:10:17 +0200
Subject: [PATCH] tests:language - merci, habsch selbstgemacht :)

---
 packages/kbot/logs/params.json              |   2 +-
 packages/kbot/tests/unit/basic-report.md    |   6 +-
 packages/kbot/tests/unit/basic.json         |  36 +++
 packages/kbot/tests/unit/basic.test.ts      | 240 +++++++++---------
 packages/kbot/tests/unit/commons.ts         |  64 +++++
 packages/kbot/tests/unit/language-report.md | 149 +++++++----
 packages/kbot/tests/unit/language.json      | 174 +++++++++++++
 packages/kbot/tests/unit/language.test.ts   | 232 ++++++++---------
 packages/kbot/tests/unit/math-report.md     |   8 +-
 packages/kbot/tests/unit/math.json          |  36 +++
 packages/kbot/tests/unit/math.test.ts       | 261 ++++++++++----------
 11 files changed, 790 insertions(+), 418 deletions(-)

diff --git a/packages/kbot/logs/params.json b/packages/kbot/logs/params.json
index d897f083..66bb0162 100644
--- a/packages/kbot/logs/params.json
+++ b/packages/kbot/logs/params.json
@@ -3,7 +3,7 @@
   "messages": [
     {
       "role": "user",
-      "content": "divide 15 by 3. Return only the number, no explanation."
+      "content": "translate \"no\" to French. Return only the translated word, no explanation."
     },
     {
       "role": "user",
diff --git a/packages/kbot/tests/unit/basic-report.md b/packages/kbot/tests/unit/basic-report.md
index 5d0b3391..ffa234b0 100644
--- a/packages/kbot/tests/unit/basic-report.md
+++ b/packages/kbot/tests/unit/basic-report.md
@@ -67,7 +67,7 @@
 - Prompt: `add 5 and 3. Return only the number, no explanation.`
 - Expected: `8`
 - Actual: `8`
-- Timestamp: 4/1/2025, 1:00:35 PM
+- Timestamp: 4/1/2025, 1:02:55 PM
 
 ### multiplication - deepseek/deepseek-chat:free
 - Prompt: `multiply 8 and 3. Return only the number, no explanation.`
@@ -86,7 +86,7 @@
 - Prompt: `multiply 8 and 3. Return only the number, no explanation.`
 - Expected: `24`
 - Actual: `24`
-- Timestamp: 4/1/2025, 1:00:37 PM
+- Timestamp: 4/1/2025, 1:02:57 PM
 
 ### division - deepseek/deepseek-chat:free
 - Prompt: `divide 15 by 3. Return only the number, no explanation.`
@@ -105,5 +105,5 @@
 - Prompt: `divide 15 by 3. Return only the number, no explanation.`
 - Expected: `5`
 - Actual: `5`
-- Timestamp: 4/1/2025, 1:00:40 PM
+- Timestamp: 4/1/2025, 1:03:00 PM
 
diff --git a/packages/kbot/tests/unit/basic.json b/packages/kbot/tests/unit/basic.json
index 76593563..abca2685 100644
--- a/packages/kbot/tests/unit/basic.json
+++ b/packages/kbot/tests/unit/basic.json
@@ -780,5 +780,41 @@
     "router": "openrouter",
     "timestamp": "2025-04-01T11:00:40.556Z",
     "passed": true
+  },
+  {
+    "test": "addition",
+    "prompt": "add 5 and 3. Return only the number, no explanation.",
+    "result": [
+      "8"
+    ],
+    "expected": "8",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:02:55.210Z",
+    "passed": true
+  },
+  {
+    "test": "multiplication",
+    "prompt": "multiply 8 and 3. Return only the number, no explanation.",
+    "result": [
+      "24"
+    ],
+    "expected": "24",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:02:57.579Z",
+    "passed": true
+  },
+  {
+    "test": "division",
+    "prompt": "divide 15 by 3. Return only the number, no explanation.",
+    "result": [
+      "5"
+    ],
+    "expected": "5",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:03:00.064Z",
+    "passed": true
   }
 ]
\ No newline at end of file
diff --git a/packages/kbot/tests/unit/basic.test.ts b/packages/kbot/tests/unit/basic.test.ts
index 000669e2..c051df61 100644
--- a/packages/kbot/tests/unit/basic.test.ts
+++ b/packages/kbot/tests/unit/basic.test.ts
@@ -4,11 +4,22 @@ import * as path from 'node:path'
 import { sync as write } from "@polymech/fs/write"
 import { sync as read } from "@polymech/fs/read"
 import { sync as exists } from "@polymech/fs/exists"
-import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
+import { 
+  models, 
+  TEST_BASE_PATH, 
+  TEST_LOGS_PATH, 
+  TEST_PREFERENCES_PATH, 
+  TEST_TIMEOUT, 
+  TestResult, 
+  formatError, 
+  isEmptyResponse,
+  getRouterForModel,
+  getApiKeyForRouter
+} from './commons'
 
 const TEST_LOG_PATH = path.resolve(__dirname, './basic.json')
 
-describe('Basic Operations', () => {
+describe('Basic Capabilities', () => {
   let testResults: TestResult[] = []
   
   // Load existing results if any
@@ -17,127 +28,109 @@ describe('Basic Operations', () => {
     testResults = Array.isArray(data) ? data : []
   }
 
-  it.each(models)('should add two numbers with model %s', async (modelName) => {
-    const prompt = 'add 5 and 3. Return only the number, no explanation.'
-    const expected = '8'
+  /**
+   * Runs one prompt against one model, asserts the normalised answer and
+   * appends the real outcome (pass or fail) to the shared JSON log.
+   *
+   * @param prompt - Prompt passed to `run`.
+   * @param expected - Expected answer; compared with the trimmed, lower-cased reply.
+   * @param testName - Value stored in the record's `test` field.
+   * @param modelName - Model identifier passed to `run`.
+   * @returns The record appended to the log (only reached when the test passed).
+   * @throws Rethrows timeouts, empty responses and failed assertions after logging them.
+   */
+  const runBasicTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
     let model = 'unknown'
     let router = 'unknown'
+    const startTime = Date.now()
+    let error: TestResult['error'] | undefined
+    let result: string[] = []
+    let entry: TestResult | undefined
+    let timer: ReturnType<typeof setTimeout> | undefined
     
-    const result = await run({
-      prompt,
-      mode: 'completion',
-      model: modelName,
-      path: TEST_BASE_PATH,
-      logs: TEST_LOGS_PATH,
-      preferences: TEST_PREFERENCES_PATH,
-      onRun: async (options) => {
-        model = options.model || 'unknown'
-        router = options.router || 'unknown'
-        return options
+    try {
+      // Race the API call against a timer so a hung request fails instead of stalling.
+      result = (await Promise.race([
+        run({
+          prompt,
+          mode: 'completion',
+          model: modelName,
+          path: TEST_BASE_PATH,
+          logs: TEST_LOGS_PATH,
+          preferences: TEST_PREFERENCES_PATH,
+          onRun: async (options) => {
+            model = options.model || 'unknown'
+            router = options.router || 'unknown'
+            return options
+          }
+        }),
+        new Promise((_, reject) => {
+          timer = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
+        })
+      ]) as string[]) || []
+      if (isEmptyResponse(result)) {
+        throw new Error('Model returned empty response')
       }
-    }) as string[]    
+      // Compare case-insensitively, ignoring surrounding whitespace.
+      expect(result[0]?.trim()?.toLowerCase() || '').toEqual(expected)
 
-    const actual = result?.[0]?.trim() || ''
-    const passed = actual === expected
-    expect(actual).toEqual(expected)
+    } catch (e) {
+      error = formatError(e)
+      throw e
+    } finally {
+      // Cancel the timer so it cannot outlive this call.
+      clearTimeout(timer)
+      const configRouter = getRouterForModel(modelName)
+      entry = {
+        test: testName,
+        prompt,
+        result,
+        expected,
+        model,
+        router,
+        timestamp: new Date().toISOString(),
+        // `error` is set only when the try block threw, so this is the real outcome.
+        passed: !error,
+        duration: Date.now() - startTime,
+        error,
+        reason: error?.message,
+        config: {
+          router: configRouter,
+          // Record only whether a key is configured; never persist the secret itself.
+          apiKey: getApiKeyForRouter(configRouter) ? '[redacted]' : ''
+        }
+      }
+      testResults.push(entry)
+      write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
+    }
+    return entry
+  }
 
-    // Add test result to array
-    testResults.push({
-      test: 'addition',
-      prompt,
-      result: result || [],
-      expected,
-      model,
-      router,
-      timestamp: new Date().toISOString(),
-      passed,
-      reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
-    })
-
-    // Write all results to the same file
-    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
+  // Greeting check: the model is asked to say "hello" and must answer 'hello'.
+  it.each(models)('should respond to "hello" with model %s', async (modelName) => {
+    const prompt = 'say "hello"'
+    const expected = 'hello'
+    const testName = 'hello'
+
+    await runBasicTest(prompt, expected, testName, modelName)
   })
 
-  it.each(models)('should multiply two numbers with model %s', async (modelName) => {
-    const prompt = 'multiply 8 and 3. Return only the number, no explanation.'
-    const expected = '24'
-    let model = 'unknown'
-    let router = 'unknown'
-    
-    const result = await run({
-      prompt,
-      mode: 'completion',
-      model: modelName,
-      path: TEST_BASE_PATH,
-      logs: TEST_LOGS_PATH,
-      preferences: TEST_PREFERENCES_PATH,
-      onRun: async (options) => {
-        model = options.model || 'unknown'
-        router = options.router || 'unknown'
-        return options
-      }
-    }) as string[]
-
-    const actual = result?.[0]?.trim() || ''
-    const passed = actual === expected
-    expect(actual).toEqual(expected)
-
-    // Add test result to array
-    testResults.push({
-      test: 'multiplication',
-      prompt,
-      result: result || [],
-      expected,
-      model,
-      router,
-      timestamp: new Date().toISOString(),
-      passed,
-      reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
-    })
-
-    // Write all results to the same file
-    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
+  // Farewell check: the model is asked to say "goodbye" and must answer 'goodbye'.
+  it.each(models)('should respond to "goodbye" with model %s', async (modelName) => {
+    const prompt = 'say "goodbye"'
+    const expected = 'goodbye'
+    const testName = 'goodbye'
+
+    await runBasicTest(prompt, expected, testName, modelName)
   })
 
-  it.each(models)('should divide two numbers with model %s', async (modelName) => {
-    const prompt = 'divide 15 by 3. Return only the number, no explanation.'
-    const expected = '5'
-    let model = 'unknown'
-    let router = 'unknown'
-    
-    const result = await run({
-      prompt,
-      mode: 'completion',
-      model: modelName,
-      path: TEST_BASE_PATH,
-      logs: TEST_LOGS_PATH,
-      preferences: TEST_PREFERENCES_PATH,
-      onRun: async (options) => {
-        model = options.model || 'unknown'
-        router = options.router || 'unknown'
-        return options
-      }
-    }) as string[]
-
-    const actual = result?.[0]?.trim() || ''
-    const passed = actual === expected
-    expect(actual).toEqual(expected)
-
-    // Add test result to array
-    testResults.push({
-      test: 'division',
-      prompt,
-      result: result || [],
-      expected,
-      model,
-      router,
-      timestamp: new Date().toISOString(),
-      passed,
-      reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
-    })
-
-    // Write all results to the same file
-    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
+  // Affirmation check: the model is asked to say "yes" and must answer 'yes'.
+  it.each(models)('should respond to "yes" with model %s', async (modelName) => {
+    const prompt = 'say "yes"'
+    const expected = 'yes'
+    const testName = 'yes'
+
+    await runBasicTest(prompt, expected, testName, modelName)
   })
 
   it('should generate markdown report', () => {
@@ -157,36 +150,55 @@ describe('Basic Operations', () => {
     })
 
     // Generate markdown report
-    let report = '# Basic Operations Test Results\n\n'
+    let report = '# Basic Test Results\n\n'
     
     // First list failed tests
     report += '## Failed Tests\n\n'
+    let hasFailures = false
     for (const [testName, modelResults] of latestResults) {
       for (const [model, result] of modelResults) {
         if (!result.passed) {
+          hasFailures = true
           report += `### ${testName} - ${model}\n`
           report += `- Prompt: \`${result.prompt}\`\n`
           report += `- Expected: \`${result.expected}\`\n`
           report += `- Actual: \`${result.result[0] || ''}\`\n`
+          report += `- Duration: ${result.duration}ms\n`
+          if (result.error) {
+            report += `- Error Type: ${result.error.type}\n`
+            report += `- Error Code: ${result.error.code}\n`
+            report += `- Error Message: ${result.error.message}\n`
+          }
           report += `- Reason: ${result.reason}\n`
           report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
         }
       }
     }
+    
+    if (!hasFailures) {
+      report += '*No failed tests*\n\n'
+    }
 
     // Then list passed tests
     report += '## Passed Tests\n\n'
+    let hasPassed = false
     for (const [testName, modelResults] of latestResults) {
       for (const [model, result] of modelResults) {
         if (result.passed) {
+          hasPassed = true
           report += `### ${testName} - ${model}\n`
           report += `- Prompt: \`${result.prompt}\`\n`
           report += `- Expected: \`${result.expected}\`\n`
           report += `- Actual: \`${result.result[0] || ''}\`\n`
+          report += `- Duration: ${result.duration}ms\n`
           report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
         }
       }
     }
+    
+    if (!hasPassed) {
+      report += '*No passed tests*\n\n'
+    }
 
     // Write report to file
     const reportPath = path.resolve(__dirname, './basic-report.md')
diff --git a/packages/kbot/tests/unit/commons.ts b/packages/kbot/tests/unit/commons.ts
index 9a3d5f50..c3289c12 100644
--- a/packages/kbot/tests/unit/commons.ts
+++ b/packages/kbot/tests/unit/commons.ts
@@ -1,6 +1,18 @@
 import * as path from 'node:path'
 import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
 
+// Router credentials read from the environment; an unset variable yields ''.
+export const TEST_CONFIG = {
+    openrouter: {
+        key: process.env.OPENROUTER_API_KEY || '',
+        org: process.env.OPENROUTER_ORG_ID || ''
+    },
+    openai: {
+        key: process.env.OPENAI_API_KEY || '',
+        org: process.env.OPENAI_ORG_ID || ''
+    }
+}
+
 export const models = [
     E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE, 
     E_OPENROUTER_MODEL_FREE.MODEL_FREE_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE,
@@ -10,6 +22,7 @@ export const models = [
 export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
 export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
 export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
+export const TEST_TIMEOUT = 30000 // Milliseconds (30 s) before an API call is treated as timed out
 
 export interface TestResult {
   test: string;
@@ -21,4 +34,55 @@ export interface TestResult {
   timestamp: string;
   passed: boolean;
   reason?: string;
+  error?: {
+    message: string;
+    code?: string;
+    type?: string;
+    details?: any;
+  };
+  duration?: number;
+  config?: {
+    apiKey?: string;
+    baseURL?: string;
+    router?: string;
+  };
+}
+
+/** Normalises any thrown value into the `TestResult['error']` shape. */
+export const formatError = (error: any): TestResult['error'] => ({
+  message: error?.message || 'Unknown error',
+  code: error?.code || 'UNKNOWN',
+  type: error?.type || error?.constructor?.name || 'Error',
+  // Prefer the response payload, then the response itself, then the raw value.
+  details: error?.response?.data || error?.response || error
+})
+
+/** True when `result` is nullish, empty, or contains only blank strings. */
+export const isEmptyResponse = (result: string[] | null | undefined): boolean =>
+  !result?.some(entry => Boolean(entry) && entry.trim() !== '')
+
+/** Maps a model id to a router name: ids starting with 'gpt-' give 'openai', all others 'openrouter'. */
+export const getRouterForModel = (model: string): string => {
+  return model.startsWith('gpt-') ? 'openai' : 'openrouter'
+}
+
+/**
+ * Looks up the API key held in TEST_CONFIG for a router name.
+ *
+ * @param router - Router name; 'openai' and 'openrouter' are recognised.
+ * @returns The configured key, or '' for any other router name.
+ */
+export const getApiKeyForRouter = (router: string): string => {
+  if (router === 'openai') return TEST_CONFIG.openai.key
+  return router === 'openrouter' ? TEST_CONFIG.openrouter.key : ''
+}
+
+/** Throws if a router API key is unset, listing every missing environment variable. */
+export const validateConfig = () => {
+  const required: Array<[string, string]> = [
+    ['OPENROUTER_API_KEY', TEST_CONFIG.openrouter.key],
+    ['OPENAI_API_KEY', TEST_CONFIG.openai.key]
+  ]
+  const missingKeys = required.filter(([, value]) => !value).map(([name]) => name)
+  if (missingKeys.length > 0) throw new Error(`Missing required environment variables: ${missingKeys.join(', ')}`)
+}
\ No newline at end of file
diff --git a/packages/kbot/tests/unit/language-report.md b/packages/kbot/tests/unit/language-report.md
index 8d778ee4..61484aa9 100644
--- a/packages/kbot/tests/unit/language-report.md
+++ b/packages/kbot/tests/unit/language-report.md
@@ -2,12 +2,103 @@
 
 ## Failed Tests
 
+### german - deepseek/deepseek-chat:free
+- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
+- Expected: `hallo`
+- Actual: ``
+- Duration: 985ms
+- Error Type: Error
+- Error Code: UNKNOWN
+- Error Message: Model returned empty response
+- Reason: Model returned empty response
+- Timestamp: 4/1/2025, 1:05:50 PM
+
+### german - google/gemini-2.0-flash-exp:free
+- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
+- Expected: `hallo`
+- Actual: ``
+- Duration: 746ms
+- Error Type: Error
+- Error Code: UNKNOWN
+- Error Message: Model returned empty response
+- Reason: Model returned empty response
+- Timestamp: 4/1/2025, 1:05:51 PM
+
+### german - gpt-4
+- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
+- Expected: `hallo`
+- Actual: ``
+- Duration: 1067ms
+- Reason: Unknown error occurred
+- Timestamp: 4/1/2025, 1:05:52 PM
+
+### spanish - deepseek/deepseek-chat:free
+- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
+- Expected: `sí`
+- Actual: ``
+- Duration: 678ms
+- Error Type: Error
+- Error Code: UNKNOWN
+- Error Message: Model returned empty response
+- Reason: Model returned empty response
+- Timestamp: 4/1/2025, 1:05:53 PM
+
+### spanish - google/gemini-2.0-flash-exp:free
+- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
+- Expected: `sí`
+- Actual: ``
+- Duration: 744ms
+- Error Type: Error
+- Error Code: UNKNOWN
+- Error Message: Model returned empty response
+- Reason: Model returned empty response
+- Timestamp: 4/1/2025, 1:05:53 PM
+
+### spanish - gpt-4
+- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
+- Expected: `sí`
+- Actual: ``
+- Duration: 1125ms
+- Reason: Unknown error occurred
+- Timestamp: 4/1/2025, 1:05:55 PM
+
+### french - deepseek/deepseek-chat:free
+- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
+- Expected: `non`
+- Actual: ``
+- Duration: 626ms
+- Error Type: Error
+- Error Code: UNKNOWN
+- Error Message: Model returned empty response
+- Reason: Model returned empty response
+- Timestamp: 4/1/2025, 1:05:55 PM
+
+### french - gpt-4
+- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
+- Expected: `non`
+- Actual: ``
+- Duration: 1341ms
+- Reason: Unknown error occurred
+- Timestamp: 4/1/2025, 1:05:57 PM
+
+### french - google/gemini-2.0-flash-exp:free
+- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
+- Expected: `non`
+- Actual: ``
+- Duration: 729ms
+- Error Type: Error
+- Error Code: UNKNOWN
+- Error Message: Model returned empty response
+- Reason: Model returned empty response
+- Timestamp: 4/1/2025, 1:05:56 PM
+
 ## Passed Tests
 
 ### german_translation - deepseek/deepseek-chat:free
 - Prompt: `translate "hello" to German. Return only the translation, no explanation.`
 - Expected: `hallo`
 - Actual: `Hallo`
+- Duration: undefinedms
 - Timestamp: 4/1/2025, 12:56:01 PM
 
 ### german_translation - google/gemini-2.0-flash-exp:free
@@ -15,18 +106,21 @@
 - Expected: `hallo`
 - Actual: `Hallo
 `
+- Duration: undefinedms
 - Timestamp: 4/1/2025, 12:56:02 PM
 
 ### german_translation - gpt-4
 - Prompt: `translate "hello" to German. Return only the translation, no explanation.`
 - Expected: `hallo`
 - Actual: `Hallo`
+- Duration: undefinedms
 - Timestamp: 4/1/2025, 12:56:32 PM
 
 ### spanish_translation - deepseek/deepseek-chat:free
 - Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
 - Expected: `sí`
 - Actual: `sí`
+- Duration: undefinedms
 - Timestamp: 4/1/2025, 12:56:05 PM
 
 ### spanish_translation - google/gemini-2.0-flash-exp:free
@@ -34,18 +128,21 @@
 - Expected: `sí`
 - Actual: `sí
 `
+- Duration: undefinedms
 - Timestamp: 4/1/2025, 12:56:06 PM
 
 ### spanish_translation - gpt-4
 - Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
 - Expected: `sí`
 - Actual: `sí`
+- Duration: undefinedms
 - Timestamp: 4/1/2025, 12:56:35 PM
 
 ### french_translation - deepseek/deepseek-chat:free
 - Prompt: `translate "no" to French. Return only the translation, no explanation.`
 - Expected: `non`
 - Actual: `non`
+- Duration: undefinedms
 - Timestamp: 4/1/2025, 12:56:08 PM
 
 ### french_translation - google/gemini-2.0-flash-exp:free
@@ -53,61 +150,13 @@
 - Expected: `non`
 - Actual: `non
 `
+- Duration: undefinedms
 - Timestamp: 4/1/2025, 12:56:10 PM
 
 ### french_translation - gpt-4
 - Prompt: `translate "no" to French. Return only the translation, no explanation.`
 - Expected: `non`
 - Actual: `non`
+- Duration: undefinedms
 - Timestamp: 4/1/2025, 12:56:37 PM
 
-### german - deepseek/deepseek-chat:free
-- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
-- Expected: `hallo`
-- Actual: `Hallo`
-- Timestamp: 4/1/2025, 12:59:07 PM
-
-### german - google/gemini-2.0-flash-exp:free
-- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
-- Expected: `hallo`
-- Actual: `Hallo
-`
-- Timestamp: 4/1/2025, 12:59:09 PM
-
-### german - gpt-4
-- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
-- Expected: `hallo`
-- Actual: `Hallo`
-- Timestamp: 4/1/2025, 12:59:11 PM
-
-### spanish - deepseek/deepseek-chat:free
-- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
-- Expected: `sí`
-- Actual: `Sí`
-- Timestamp: 4/1/2025, 12:59:12 PM
-
-### spanish - google/gemini-2.0-flash-exp:free
-- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
-- Expected: `sí`
-- Actual: `Sí
-`
-- Timestamp: 4/1/2025, 12:59:14 PM
-
-### spanish - gpt-4
-- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
-- Expected: `sí`
-- Actual: `sí`
-- Timestamp: 4/1/2025, 12:59:15 PM
-
-### french - deepseek/deepseek-chat:free
-- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
-- Expected: `non`
-- Actual: `non`
-- Timestamp: 4/1/2025, 12:59:18 PM
-
-### french - gpt-4
-- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
-- Expected: `non`
-- Actual: `non`
-- Timestamp: 4/1/2025, 12:59:20 PM
-
diff --git a/packages/kbot/tests/unit/language.json b/packages/kbot/tests/unit/language.json
index 68177172..9fc6d16e 100644
--- a/packages/kbot/tests/unit/language.json
+++ b/packages/kbot/tests/unit/language.json
@@ -526,5 +526,179 @@
     "router": "openrouter",
     "timestamp": "2025-04-01T10:59:20.589Z",
     "passed": true
+  },
+  {
+    "test": "german",
+    "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
+    "result": [
+      "Hallo"
+    ],
+    "expected": "hallo",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:03:09.890Z",
+    "passed": true
+  },
+  {
+    "test": "spanish",
+    "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
+    "result": [
+      "sí"
+    ],
+    "expected": "sí",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:03:12.312Z",
+    "passed": true
+  },
+  {
+    "test": "french",
+    "prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
+    "result": [
+      "non"
+    ],
+    "expected": "non",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:03:14.660Z",
+    "passed": true
+  },
+  {
+    "test": "german",
+    "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
+    "result": [],
+    "expected": "hallo",
+    "model": "deepseek/deepseek-chat:free",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:05:50.723Z",
+    "passed": false,
+    "duration": 985,
+    "error": {
+      "message": "Model returned empty response",
+      "code": "UNKNOWN",
+      "type": "Error"
+    },
+    "reason": "Model returned empty response"
+  },
+  {
+    "test": "german",
+    "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
+    "result": [],
+    "expected": "hallo",
+    "model": "google/gemini-2.0-flash-exp:free",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:05:51.471Z",
+    "passed": false,
+    "duration": 746,
+    "error": {
+      "message": "Model returned empty response",
+      "code": "UNKNOWN",
+      "type": "Error"
+    },
+    "reason": "Model returned empty response"
+  },
+  {
+    "test": "german",
+    "prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
+    "result": [],
+    "expected": "hallo",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:05:52.540Z",
+    "passed": false,
+    "duration": 1067,
+    "reason": "Unknown error occurred"
+  },
+  {
+    "test": "spanish",
+    "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
+    "result": [],
+    "expected": "sí",
+    "model": "deepseek/deepseek-chat:free",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:05:53.219Z",
+    "passed": false,
+    "duration": 678,
+    "error": {
+      "message": "Model returned empty response",
+      "code": "UNKNOWN",
+      "type": "Error"
+    },
+    "reason": "Model returned empty response"
+  },
+  {
+    "test": "spanish",
+    "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
+    "result": [],
+    "expected": "sí",
+    "model": "google/gemini-2.0-flash-exp:free",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:05:53.964Z",
+    "passed": false,
+    "duration": 744,
+    "error": {
+      "message": "Model returned empty response",
+      "code": "UNKNOWN",
+      "type": "Error"
+    },
+    "reason": "Model returned empty response"
+  },
+  {
+    "test": "spanish",
+    "prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
+    "result": [],
+    "expected": "sí",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:05:55.090Z",
+    "passed": false,
+    "duration": 1125,
+    "reason": "Unknown error occurred"
+  },
+  {
+    "test": "french",
+    "prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
+    "result": [],
+    "expected": "non",
+    "model": "deepseek/deepseek-chat:free",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:05:55.717Z",
+    "passed": false,
+    "duration": 626,
+    "error": {
+      "message": "Model returned empty response",
+      "code": "UNKNOWN",
+      "type": "Error"
+    },
+    "reason": "Model returned empty response"
+  },
+  {
+    "test": "french",
+    "prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
+    "result": [],
+    "expected": "non",
+    "model": "google/gemini-2.0-flash-exp:free",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:05:56.447Z",
+    "passed": false,
+    "duration": 729,
+    "error": {
+      "message": "Model returned empty response",
+      "code": "UNKNOWN",
+      "type": "Error"
+    },
+    "reason": "Model returned empty response"
+  },
+  {
+    "test": "french",
+    "prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
+    "result": [],
+    "expected": "non",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:05:57.790Z",
+    "passed": false,
+    "duration": 1341,
+    "reason": "Unknown error occurred"
   }
 ]
\ No newline at end of file
diff --git a/packages/kbot/tests/unit/language.test.ts b/packages/kbot/tests/unit/language.test.ts
index 4eed7007..919273ba 100644
--- a/packages/kbot/tests/unit/language.test.ts
+++ b/packages/kbot/tests/unit/language.test.ts
@@ -4,7 +4,18 @@ import * as path from 'node:path'
 import { sync as write } from "@polymech/fs/write"
 import { sync as read } from "@polymech/fs/read"
 import { sync as exists } from "@polymech/fs/exists"
-import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
+import { 
+  models, 
+  TEST_BASE_PATH, 
+  TEST_LOGS_PATH, 
+  TEST_PREFERENCES_PATH, 
+  TEST_TIMEOUT, 
+  TestResult, 
+  formatError, 
+  isEmptyResponse,
+  getRouterForModel,
+  getApiKeyForRouter
+} from './commons'
 
 import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL  } from '../../src/index'
 
@@ -19,127 +30,109 @@ describe('Language Capabilities', () => {
     testResults = Array.isArray(data) ? data : []
   }
 
-  it.each(models)('should translate "hello" to German with model %s', async (modelName) => {
-    const prompt = 'translate "hello" to German. Return only the translated word, no explanation.'
-    const expected = 'hallo'
+  const runTranslationTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
     let model = 'unknown'
     let router = 'unknown'
+    let startTime = Date.now()
+    let error: TestResult['error'] | undefined
     
-    const result = await run({
-      prompt,
-      mode: 'completion',
-      model: modelName,
-      path: TEST_BASE_PATH,
-      logs: TEST_LOGS_PATH,
-      preferences: TEST_PREFERENCES_PATH,
-      onRun: async (options) => {
-        model = options.model || 'unknown'
-        router = options.router || 'unknown'
-        return options
+    try {
+      const result = await Promise.race([
+        run({
+          prompt,
+          mode: 'completion',
+          model: modelName,
+          path: TEST_BASE_PATH,
+          logs: TEST_LOGS_PATH,
+          preferences: TEST_PREFERENCES_PATH,
+          onRun: async (options) => {
+            model = options.model || 'unknown'
+            router = options.router || 'unknown'
+            return options
+          }
+        }),
+        new Promise((_, reject) => 
+          setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
+        )
+      ]) as string[]
+
+      const actual = result?.[0]?.trim()?.toLowerCase() || ''
+      const passed = actual === expected && !isEmptyResponse(result)
+      
+      if (isEmptyResponse(result)) {
+        throw new Error('Model returned empty response')
       }
-    }) as string[]
+      
+      expect(actual).toEqual(expected)
 
-    const actual = result?.[0]?.trim()?.toLowerCase() || ''
-    const passed = actual === expected
-    expect(actual).toEqual(expected)
+      return {
+        test: testName,
+        prompt,
+        result: result || [],
+        expected,
+        model,
+        router,
+        timestamp: new Date().toISOString(),
+        passed,
+        duration: Date.now() - startTime,
+        reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
+        config: {
+          router: getRouterForModel(modelName),
+          apiKey: getApiKeyForRouter(getRouterForModel(modelName))
+        }
+      }
+    } catch (e) {
+      error = formatError(e)
+      throw e
+    } finally {
+      const testResult: TestResult = {
+        test: testName,
+        prompt,
+        result: [],
+        expected,
+        model,
+        router,
+        timestamp: new Date().toISOString(),
+        passed: false,
+        duration: Date.now() - startTime,
+        error,
+        reason: error?.message || 'Unknown error occurred',
+        config: {
+          router: getRouterForModel(modelName),
+          apiKey: getApiKeyForRouter(getRouterForModel(modelName))
+        }
+      }
+      
+      testResults.push(testResult)
+      write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
+    }
+  }
 
-    // Add test result to array
-    testResults.push({
-      test: 'german',
-      prompt,
-      result: result || [],
-      expected,
-      model,
-      router,
-      timestamp: new Date().toISOString(),
-      passed,
-      reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
-    })
-
-    // Write all results to the same file
-    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
+  it.each(models)('should translate "hello" to German with model %s', async (modelName) => {
+    await runTranslationTest(
+      'translate "hello" to German. Return only the translated word, no explanation.',
+      'hallo',
+      'german',
+      modelName
+    )
   })
 
   it.each(models)('should translate "yes" to Spanish with model %s', async (modelName) => {
-    const prompt = 'translate "yes" to Spanish. Return only the translated word, no explanation.'
-    const expected = 'sí'
-    let model = 'unknown'
-    let router = 'unknown'
-    
-    const result = await run({
-      prompt,
-      mode: 'completion',
-      model: modelName,
-      path: TEST_BASE_PATH,
-      logs: TEST_LOGS_PATH,
-      preferences: TEST_PREFERENCES_PATH,
-      onRun: async (options) => {
-        model = options.model || 'unknown'
-        router = options.router || 'unknown'
-        return options
-      }
-    }) as string[]
-
-    const actual = result?.[0]?.trim()?.toLowerCase() || ''
-    const passed = actual === expected
-    expect(actual).toEqual(expected)
-
-    // Add test result to array
-    testResults.push({
-      test: 'spanish',
-      prompt,
-      result: result || [],
-      expected,
-      model,
-      router,
-      timestamp: new Date().toISOString(),
-      passed,
-      reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
-    })
-
-    // Write all results to the same file
-    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
+    await runTranslationTest(
+      'translate "yes" to Spanish. Return only the translated word, no explanation.',
+      'sí',
+      'spanish',
+      modelName
+    )
   })
 
   it.each(models)('should translate "no" to French with model %s', async (modelName) => {
-    const prompt = 'translate "no" to French. Return only the translated word, no explanation.'
-    const expected = 'non'
-    let model = 'unknown'
-    let router = 'unknown'
-    
-    const result = await run({
-      prompt,
-      mode: 'completion',
-      model: modelName,
-      path: TEST_BASE_PATH,
-      logs: TEST_LOGS_PATH,
-      preferences: TEST_PREFERENCES_PATH,
-      onRun: async (options) => {
-        model = options.model || 'unknown'
-        router = options.router || 'unknown'
-        return options
-      }
-    }) as string[]
-
-    const actual = result?.[0]?.trim()?.toLowerCase() || ''
-    const passed = actual === expected
-    expect(actual).toEqual(expected)
-
-    // Add test result to array
-    testResults.push({
-      test: 'french',
-      prompt,
-      result: result || [],
-      expected,
-      model,
-      router,
-      timestamp: new Date().toISOString(),
-      passed,
-      reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
-    })
-
-    // Write all results to the same file
-    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
+    await runTranslationTest(
+      'translate "no" to French. Return only the translated word, no explanation.',
+      'non',
+      'french',
+      modelName
+    )
   })
 
   it('should generate markdown report', () => {
@@ -163,32 +156,51 @@ describe('Language Capabilities', () => {
     
     // First list failed tests
     report += '## Failed Tests\n\n'
+    let hasFailures = false
     for (const [testName, modelResults] of latestResults) {
       for (const [model, result] of modelResults) {
         if (!result.passed) {
+          hasFailures = true
           report += `### ${testName} - ${model}\n`
           report += `- Prompt: \`${result.prompt}\`\n`
           report += `- Expected: \`${result.expected}\`\n`
           report += `- Actual: \`${result.result[0] || ''}\`\n`
+          report += `- Duration: ${result.duration}ms\n`
+          if (result.error) {
+            report += `- Error Type: ${result.error.type}\n`
+            report += `- Error Code: ${result.error.code}\n`
+            report += `- Error Message: ${result.error.message}\n`
+          }
           report += `- Reason: ${result.reason}\n`
           report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
         }
       }
     }
+    
+    if (!hasFailures) {
+      report += '*No failed tests*\n\n'
+    }
 
     // Then list passed tests
     report += '## Passed Tests\n\n'
+    let hasPassed = false
     for (const [testName, modelResults] of latestResults) {
       for (const [model, result] of modelResults) {
         if (result.passed) {
+          hasPassed = true
           report += `### ${testName} - ${model}\n`
           report += `- Prompt: \`${result.prompt}\`\n`
           report += `- Expected: \`${result.expected}\`\n`
           report += `- Actual: \`${result.result[0] || ''}\`\n`
+          report += `- Duration: ${result.duration}ms\n`
           report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
         }
       }
     }
+    
+    if (!hasPassed) {
+      report += '*No passed tests*\n\n'
+    }
 
     // Write report to file
     const reportPath = path.resolve(__dirname, './language-report.md')
diff --git a/packages/kbot/tests/unit/math-report.md b/packages/kbot/tests/unit/math-report.md
index 8590997b..df14d555 100644
--- a/packages/kbot/tests/unit/math-report.md
+++ b/packages/kbot/tests/unit/math-report.md
@@ -21,7 +21,7 @@
 - Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.`
 - Expected: `120`
 - Actual: `120`
-- Timestamp: 4/1/2025, 12:59:11 PM
+- Timestamp: 4/1/2025, 1:03:25 PM
 
 ### fibonacci - deepseek/deepseek-chat:free
 - Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
@@ -40,7 +40,7 @@
 - Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
 - Expected: `0,1,1,2,3`
 - Actual: `0, 1, 1, 2, 3`
-- Timestamp: 4/1/2025, 12:59:16 PM
+- Timestamp: 4/1/2025, 1:03:27 PM
 
 ### quadratic - deepseek/deepseek-chat:free
 - Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
@@ -57,6 +57,6 @@
 ### quadratic - gpt-4
 - Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
 - Expected: `[-3,-2]`
-- Actual: `["-2", "-3"]`
-- Timestamp: 4/1/2025, 12:59:20 PM
+- Actual: `[-2, -3]`
+- Timestamp: 4/1/2025, 1:03:30 PM
 
diff --git a/packages/kbot/tests/unit/math.json b/packages/kbot/tests/unit/math.json
index 9803f2e7..99c7071d 100644
--- a/packages/kbot/tests/unit/math.json
+++ b/packages/kbot/tests/unit/math.json
@@ -984,5 +984,41 @@
     "router": "openrouter",
     "timestamp": "2025-04-01T10:59:20.963Z",
     "passed": true
+  },
+  {
+    "test": "factorial",
+    "prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
+    "result": [
+      "120"
+    ],
+    "expected": "120",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:03:25.009Z",
+    "passed": true
+  },
+  {
+    "test": "fibonacci",
+    "prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
+    "result": [
+      "0, 1, 1, 2, 3"
+    ],
+    "expected": "0,1,1,2,3",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:03:27.826Z",
+    "passed": true
+  },
+  {
+    "test": "quadratic",
+    "prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
+    "result": [
+      "[-2, -3]"
+    ],
+    "expected": "[-3,-2]",
+    "model": "gpt-4",
+    "router": "openrouter",
+    "timestamp": "2025-04-01T11:03:30.579Z",
+    "passed": true
   }
 ]
\ No newline at end of file
diff --git a/packages/kbot/tests/unit/math.test.ts b/packages/kbot/tests/unit/math.test.ts
index cbbd585b..60f6a5fe 100644
--- a/packages/kbot/tests/unit/math.test.ts
+++ b/packages/kbot/tests/unit/math.test.ts
@@ -4,13 +4,24 @@ import * as path from 'node:path'
 import { sync as write } from "@polymech/fs/write"
 import { sync as read } from "@polymech/fs/read"
 import { sync as exists } from "@polymech/fs/exists"
-import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
+import { 
+  models, 
+  TEST_BASE_PATH, 
+  TEST_LOGS_PATH, 
+  TEST_PREFERENCES_PATH, 
+  TEST_TIMEOUT, 
+  TestResult, 
+  formatError, 
+  isEmptyResponse,
+  getRouterForModel,
+  getApiKeyForRouter
+} from './commons'
 
 import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL  } from '../../src/index'
 
 const TEST_LOG_PATH = path.resolve(__dirname, './math.json')
 
-describe('Advanced Math Operations', () => {
+describe('Math Capabilities', () => {
   let testResults: TestResult[] = []
   
   // Load existing results if any
@@ -19,150 +30,109 @@ describe('Advanced Math Operations', () => {
     testResults = Array.isArray(data) ? data : []
   }
 
-  it.each(models)('should calculate factorial of 5 with model %s', async (modelName) => {
-    const prompt = 'calculate the factorial of 5 (5!). Return only the number, no explanation.'
-    const expected = '120'
+  const runMathTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
     let model = 'unknown'
     let router = 'unknown'
+    let startTime = Date.now()
+    let error: TestResult['error'] | undefined
     
-    const result = await run({
-      prompt,
-      mode: 'completion',
-      model: modelName,
-      path: TEST_BASE_PATH,
-      logs: TEST_LOGS_PATH,
-      preferences: TEST_PREFERENCES_PATH,
-      onRun: async (options) => {
-        model = options.model || 'unknown'
-        router = options.router || 'unknown'
-        return options
-      }
-    }) as string[]
-
-    const actual = result?.[0]?.trim() || ''
-    const passed = actual === expected
-    expect(actual).toEqual(expected)
-
-    // Add test result to array
-    testResults.push({
-      test: 'factorial',
-      prompt,
-      result: result || [],
-      expected,
-      model,
-      router,
-      timestamp: new Date().toISOString(),
-      passed,
-      reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
-    })
-
-    // Write all results to the same file
-    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
-  })
-
-  it.each(models)('should calculate fibonacci sequence up to 5th number with model %s', async (modelName) => {
-    const prompt = 'calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.'
-    const expected = '0,1,1,2,3'
-    let model = 'unknown'
-    let router = 'unknown'
-    
-    const result = await run({
-      prompt,
-      mode: 'completion',
-      model: modelName,
-      path: TEST_BASE_PATH,
-      logs: TEST_LOGS_PATH,
-      preferences: TEST_PREFERENCES_PATH,
-      onRun: async (options) => {
-        model = options.model || 'unknown'
-        router = options.router || 'unknown'
-        return options
-      }
-    }) as string[]    
-
-    // Handle both formats: "0,1,1,2,3" and "0, 1, 1, 2, 3"
-    const numbers = result?.[0]?.trim()?.split(',')?.map(n => n.trim()) || []
-    const actual = numbers.join(',')
-    const passed = actual === expected
-    expect(numbers).toEqual(['0', '1', '1', '2', '3'])
-
-    // Add test result to array
-    testResults.push({
-      test: 'fibonacci',
-      prompt,
-      result: result || [],
-      expected,
-      model,
-      router,
-      timestamp: new Date().toISOString(),
-      passed,
-      reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
-    })
-
-    // Write all results to the same file
-    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
-  })
-
-  it.each(models)('should solve quadratic equation x² + 5x + 6 = 0 with model %s', async (modelName) => {
-    const prompt = 'solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.'
-    const expectedDisplay = '[-3,-2]'
-    let model = 'unknown'
-    let router = 'unknown'
-    
-    const result = await run({
-      prompt,
-      mode: 'completion',
-      model: modelName,
-      filters: 'code',
-      path: TEST_BASE_PATH,
-      logs: TEST_LOGS_PATH,
-      preferences: TEST_PREFERENCES_PATH,
-      onRun: async (options) => {
-        model = options.model || 'unknown'
-        router = options.router || 'unknown'
-        return options
-      }
-    }) as string[]    
-
-    // Parse the result as JSON (markdown already stripped by filter)
-    let jsonResult: number[]
     try {
-      const resultStr = result?.[0]?.trim() || '[]'
-      if (!resultStr) {
-        throw new Error('No result returned from model')
+      const result = await Promise.race([
+        run({
+          prompt,
+          mode: 'completion',
+          model: modelName,
+          path: TEST_BASE_PATH,
+          logs: TEST_LOGS_PATH,
+          preferences: TEST_PREFERENCES_PATH,
+          onRun: async (options) => {
+            model = options.model || 'unknown'
+            router = options.router || 'unknown'
+            return options
+          }
+        }),
+        new Promise((_, reject) => 
+          setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
+        )
+      ]) as string[]
+
+      const actual = result?.[0]?.trim()?.toLowerCase() || ''
+      const passed = actual === expected && !isEmptyResponse(result)
+      
+      if (isEmptyResponse(result)) {
+        throw new Error('Model returned empty response')
       }
-      jsonResult = JSON.parse(resultStr)
-      if (!Array.isArray(jsonResult)) {
-        throw new Error('Result is not an array')
+      
+      expect(actual).toEqual(expected)
+
+      return {
+        test: testName,
+        prompt,
+        result: result || [],
+        expected,
+        model,
+        router,
+        timestamp: new Date().toISOString(),
+        passed,
+        duration: Date.now() - startTime,
+        reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
+        config: {
+          router: getRouterForModel(modelName),
+          apiKey: getApiKeyForRouter(getRouterForModel(modelName))
+        }
       }
-      // Convert any string numbers to actual numbers
-      jsonResult = jsonResult.map(n => typeof n === 'string' ? parseFloat(n) : n)
-    } catch (error) {
-      // If parsing fails, try to extract numbers from the string
-      const numbers = result?.[0]?.match(/-?\d+/g)?.map(n => parseInt(n, 10)) || []
-      jsonResult = numbers
+    } catch (e) {
+      error = formatError(e)
+      throw e
+    } finally {
+      const testResult: TestResult = {
+        test: testName,
+        prompt,
+        result: [],
+        expected,
+        model,
+        router,
+        timestamp: new Date().toISOString(),
+        passed: false,
+        duration: Date.now() - startTime,
+        error,
+        reason: error?.message || 'Unknown error occurred',
+        config: {
+          router: getRouterForModel(modelName),
+          apiKey: getApiKeyForRouter(getRouterForModel(modelName))
+        }
+      }
+      
+      testResults.push(testResult)
+      write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
     }
+  }
 
-    const actual = JSON.stringify(jsonResult.sort())
-    const expectedSorted = JSON.stringify([-3, -2].sort())
-    const passed = actual === expectedSorted
-    expect(jsonResult.sort()).toEqual([-3, -2].sort())
+  it.each(models)('should add two numbers with model %s', async (modelName) => {
+    await runMathTest(
+      'add 5 and 3. Return only the number, no explanation.',
+      '8',
+      'addition',
+      modelName
+    )
+  })
 
-    // Add test result to array
-    testResults.push({
-      test: 'quadratic',
-      prompt,
-      result: result || [],
-      expected: expectedDisplay,
-      model,
-      router,
-      timestamp: new Date().toISOString(),
-      passed,
-      reason: !result?.[0] ? 'No result returned from model' : passed ? undefined : `Expected ${expectedDisplay}, but got ${result?.[0] || ''}`
-    })
+  it.each(models)('should multiply two numbers with model %s', async (modelName) => {
+    await runMathTest(
+      'multiply 8 and 3. Return only the number, no explanation.',
+      '24',
+      'multiplication',
+      modelName
+    )
+  })
 
-    // Write all results to the same file
-    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
+  it.each(models)('should divide two numbers with model %s', async (modelName) => {
+    await runMathTest(
+      'divide 15 by 3. Return only the number, no explanation.',
+      '5',
+      'division',
+      modelName
+    )
   })
 
   it('should generate markdown report', () => {
@@ -186,32 +156,51 @@ describe('Advanced Math Operations', () => {
     
     // First list failed tests
     report += '## Failed Tests\n\n'
+    let hasFailures = false
     for (const [testName, modelResults] of latestResults) {
       for (const [model, result] of modelResults) {
         if (!result.passed) {
+          hasFailures = true
           report += `### ${testName} - ${model}\n`
           report += `- Prompt: \`${result.prompt}\`\n`
           report += `- Expected: \`${result.expected}\`\n`
           report += `- Actual: \`${result.result[0] || ''}\`\n`
+          report += `- Duration: ${result.duration}ms\n`
+          if (result.error) {
+            report += `- Error Type: ${result.error.type}\n`
+            report += `- Error Code: ${result.error.code}\n`
+            report += `- Error Message: ${result.error.message}\n`
+          }
           report += `- Reason: ${result.reason}\n`
           report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
         }
       }
     }
+    
+    if (!hasFailures) {
+      report += '*No failed tests*\n\n'
+    }
 
     // Then list passed tests
     report += '## Passed Tests\n\n'
+    let hasPassed = false
     for (const [testName, modelResults] of latestResults) {
       for (const [model, result] of modelResults) {
         if (result.passed) {
+          hasPassed = true
           report += `### ${testName} - ${model}\n`
           report += `- Prompt: \`${result.prompt}\`\n`
           report += `- Expected: \`${result.expected}\`\n`
           report += `- Actual: \`${result.result[0] || ''}\`\n`
+          report += `- Duration: ${result.duration}ms\n`
           report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
         }
       }
     }
+    
+    if (!hasPassed) {
+      report += '*No passed tests*\n\n'
+    }
 
     // Write report to file
     const reportPath = path.resolve(__dirname, './math-report.md')