fix: response format for tests

2025-06-06 00:35:53 +02:00 · 2025-06-06 00:35:53 +02:00 · 1535cb754a
commit 1535cb754a
parent d9dc88b972
15 changed files with 4661 additions and 283 deletions
--- a/packages/kbot/.vscode/launch.json
+++ b/packages/kbot/.vscode/launch.json
@ -4,6 +4,23 @@
 	// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 	"version": "0.2.0",
 	"configurations": [
+		{
+			"type": "node",
+			"request": "launch",
+			"name": "Vitest: Debug Open File",
+			"program": "${workspaceFolder}/node_modules/vitest/vitest.mjs",
+			"args": [
+				"run",
+				"${relativeFile}"
+			],
+			"skipFiles": [
+				"<node_internals>/**"
+			],
+			"console": "integratedTerminal",
+			"sourceMaps": true,
+			"smartStep": true,
+			"internalConsoleOptions": "neverOpen"
+		},
 		{
 			"type": "node",
 			"request": "launch",
@ -768,5 +785,6 @@
 			"console": "integratedTerminal",
 			"outputCapture": "std"
 		}
-	]
+	],
+	"compounds": []
 }
--- a/packages/kbot/dist-in/commands/run.js
+++ b/packages/kbot/dist-in/commands/run.js
--- a/packages/kbot/logs/params.json
+++ b/packages/kbot/logs/params.json
@ -3,7 +3,7 @@
  "messages": [
    {
      "role": "user",
-      "content": "multiply 8 and 3. Return only the number, no explanation."
+      "content": "Provide a synonym for \"happy\". Return only the synonym, no explanation."
    },
    {
      "role": "user",
--- a/packages/kbot/package-lock.json
+++ b/packages/kbot/package-lock.json
@ -9,6 +9,7 @@
      "version": "0.3.5",
      "license": "MIT",
      "dependencies": {
+        "@dmitryrechkin/json-schema-to-zod": "1.0.1",
        "@polymech/ai-tools": "file:../ai-tools",
        "@polymech/cache": "file:../cache",
        "@polymech/commons": "file:../commons",
@ -400,6 +401,14 @@
        "node": ">=14.17.0"
      }
    },
+    "node_modules/@dmitryrechkin/json-schema-to-zod": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/@dmitryrechkin/json-schema-to-zod/-/json-schema-to-zod-1.0.1.tgz",
+      "integrity": "sha512-cG9gC4NMu/7JZqmRZy6uIb+l+kxek2GFQ0/qrhw7xeFK2l5B9yF9FVuujoqFPLRGDHNFYqtBWht7hY4KB0ngrA==",
+      "dependencies": {
+        "zod": "^3.23.8"
+      }
+    },
    "node_modules/@esbuild/aix-ppc64": {
      "version": "0.21.5",
      "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",
--- a/packages/kbot/package.json
+++ b/packages/kbot/package.json
@ -53,6 +53,7 @@
    "examples:iterator-markdown:no-cache": "node dist-in/examples/core/iterator-markdown-example.js --no-cache"
  },
  "dependencies": {
+    "@dmitryrechkin/json-schema-to-zod": "1.0.1",
    "@polymech/ai-tools": "file:../ai-tools",
    "@polymech/cache": "file:../cache",
    "@polymech/commons": "file:../commons",
--- a/packages/kbot/src/commands/run.ts
+++ b/packages/kbot/src/commands/run.ts
@ -88,6 +88,7 @@ export const complete_options = async (opts: IKBotTask): Promise<IKBotTask | nul
    options.client = client
    options.variables = { ...options.variables, ...variables(options) }
    options.collector = collector(options, client)
+    options.format = opts.format
    options.onRun = options.onRun || (async (options) => options)

    return options
@ -174,7 +175,6 @@ export const complete_params = async (
  if (options.mode === E_Mode.TOOLS || options.mode === E_Mode.ASSISTANT) {
    params.tools = await loadTools(options)
    params.tool_choice = 'auto'
-    //params.parallel_tool_calls = false
  }

  return params
--- a/packages/kbot/tests/unit/commons.ts
+++ b/packages/kbot/tests/unit/commons.ts
@ -5,16 +5,46 @@ import { sync as write } from "@polymech/fs/write"
 import { sync as read } from "@polymech/fs/read"
 import { sync as exists } from "@polymech/fs/exists"
 import { sync as mkdirp } from "mkdirp"
-
+import { zodResponseFormat } from "openai/helpers/zod"
+import { JSONSchemaToZod } from '@dmitryrechkin/json-schema-to-zod';
 export enum ModelCategory {
  FAST = 'fast',
  LANGUAGE = 'language',
  TOOL = 'tool',
  ALL = 'all',
  CODING = 'coding',
-  FILES = 'file'
+  FILES = 'file',
+  TEST_EQUAL = ''
 }

+export enum EqualityCheck { // Names of the comparison strategies a test can select through options.equalityCheck.
+  DEFAULT = 'default', // trimmed, case-insensitive string comparison
+  JSON_EQUAL = 'json_equal', // both sides are JSON.parse'd before being compared
+  LLM_EQUAL = 'llm_equal', // NOTE(review): no handler is registered for this key in EQUALITY_CHECKS, so runTest falls back to DEFAULT — confirm intent
+  NONE = 'none' // comparison always passes; used by tests that assert on the result themselves
+}
+
+export type EqualityFn = (actual: string, expected: string) => Promise<boolean>; // Async predicate: resolves to true when `actual` is accepted as matching `expected`.
+
+export const EQUALITY_CHECKS: Record<string, EqualityFn> = { // Lookup table from an EqualityCheck value to the function that implements it.
+  [EqualityCheck.DEFAULT]: async (actual: string, expected: string): Promise<boolean> => { // trim + lowercase both sides, then strict string equality
+    return (actual || '').trim().toLowerCase() === (expected || '').trim().toLowerCase(); // falsy inputs are treated as ''
+  },
+  [EqualityCheck.JSON_EQUAL]: async (actual: string, expected: string): Promise<boolean> => { // compares the parsed JSON values rather than the raw text
+    try {
+      // we just stringify to normalize and compare
+      const actualJson = JSON.parse(actual.trim());
+      const expectedJson = JSON.parse(expected.trim());
+      return JSON.stringify(actualJson) === JSON.stringify(expectedJson); // ignores whitespace differences, but object key order still has to match
+    } catch (e) {
+      return false; // anything thrown above (invalid JSON, non-string input) counts as "not equal"
+    }
+  },
+  [EqualityCheck.NONE]: async (): Promise<boolean> => { // comparison disabled: arguments are ignored
+    return true;
+  },
+};
+
 export const getFastModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_SONNET_4,
@ -23,6 +53,12 @@ export const getFastModels = (): string[] => {
  ]
 }

+export const getTestEqualModels = (): string[] => { // Model list that getDefaultModels returns for ModelCategory.TEST_EQUAL.
+  return [
+    E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE
+  ]
+}
+
 export const getCodingModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_NVIDIA_LLAMA_3_3_NEMOTRON_SUPER_49B_V1_FREE
@ -31,8 +67,7 @@ export const getCodingModels = (): string[] => {

 export const getFileModels = (): string[] => {
  return [
-    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
-    E_OPENROUTER_MODEL.MODEL_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE
+    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
  ]
 }

@ -68,6 +103,8 @@ export const getDefaultModels = (category: ModelCategory = ModelCategory.FAST):
      return getCodingModels()
    case ModelCategory.FILES:
      return getFileModels()
+    case ModelCategory.TEST_EQUAL:
+      return getTestEqualModels()
    case ModelCategory.ALL:
    default:
      return [
@ -184,6 +221,11 @@ export const runTest = async (
  let defaultOptions = {
    filters: 'code'
  }
+  let format: any = null
+  if (options.format) {
+    const zodSchema = JSONSchemaToZod.convert(options.format);
+    format = zodResponseFormat(zodSchema, "format");
+  }
  try {
    const result = await Promise.race([
      run({
@ -194,7 +236,7 @@ export const runTest = async (
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        logLevel: 2,
-        ...{ ...defaultOptions, ...options },
+        ...{ ...defaultOptions, ...options, format },
        onRun: async (options) => {
          model = options.model || 'unknown'
          router = options.model as string
@ -220,8 +262,10 @@ export const runTest = async (
        reason: 'Model returned empty response'
      }
    } else {
-      const actual = result?.[0]?.trim()?.toLowerCase() || ''
-      const passed = actual === expected
+      const actual = result?.[0] || ''
+      const equalityCheck = options.equalityCheck || EqualityCheck.DEFAULT
+      const checkFn = EQUALITY_CHECKS[equalityCheck] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]
+      const passed = await checkFn(actual, expected)

      testResult = {
        test: testName,
@ -233,7 +277,7 @@ export const runTest = async (
        timestamp: new Date().toISOString(),
        passed,
        duration: Date.now() - startTime,
-        reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
+        reason: passed ? undefined : `Expected ${expected}, but got ${actual.trim()}`,
      }
    }
  } catch (e) {
--- a/packages/kbot/tests/unit/files.test.ts
+++ b/packages/kbot/tests/unit/files.test.ts
@ -13,7 +13,8 @@ import {
    runTest,
    generateTestReport,
    getReportPaths,
-    ModelCategory
+    ModelCategory,
+    EqualityCheck
 } from './commons'
 import { IKBotOptions } from '@polymech/commons'

@ -25,6 +26,42 @@ describe('File Operations', () => {
    const TEST_LOG_PATH = getReportPaths('files', 'json')
    const TEST_REPORT_PATH = getReportPaths('files', 'md')
   
+    it.each(models)('should identify animals in image files with model %s', async (modelName) => { // Runs once per entry in `models`: includes two image files and checks the structured reply names a cat and a fox.
+        const result = await runTest(
+            'What animals are shown in these images?',
+            '["cat","fox"]', // passed as the expected value; with EqualityCheck.NONE below, runTest's own comparison always passes
+            'file-inclusion',
+            modelName,
+            TEST_LOG_PATH,
+            'completion',
+            {
+                include: ['tests/test-data/files/lazyfox.jpg', 'tests/test-data/files/cat.jpg'],
+                logLevel: 2,
+                equalityCheck: EqualityCheck.NONE, // skip string comparison; the expect() calls below validate the parsed reply instead
+                format: { // JSON Schema for the reply; runTest converts it with JSONSchemaToZod.convert and zodResponseFormat
+                    type: "object",
+                    properties: {
+                        animals: {
+                            type: "array",
+                            items: {
+                                type: "string"
+                            },
+                            minItems: 2,
+                            maxItems: 2
+                        }
+                    },
+                    required: ["animals"]
+                }
+            } as IKBotOptions
+        )
+        testResults.push(result)        // collected for the report written by generateTestReport
+        const parsedResult = JSON.parse(result.result[0]?.trim()) // presumably a JSON string shaped like the schema above — throws if the model returned something else
+        const animals = parsedResult.animals.map((s: string) => s.toLowerCase()) // lowercase so the checks are case-insensitive
+        expect(animals).toContain('cat')
+        expect(animals).toContain('fox')
+        expect(animals.length).toBe(2)
+    }, { timeout: TEST_TIMEOUT })
+
    it.each(models)('should process single file with model %s', async (modelName) => {
        const result = await runTest(
            'What is the name of the algorithm implemented in these files? Return only the name.',
@ -53,6 +90,7 @@ describe('File Operations', () => {
            'completion',
            {
                include: ['./tests/test-data/files/*.js'],
+                equalityCheck: EqualityCheck.NONE,
                logLevel: 2,
                format: {
                    type: "object",
@ -70,11 +108,12 @@ describe('File Operations', () => {
                }
            } as IKBotOptions
        )
-        testResults.push(result)        
+        testResults.push(result)   
        const parsedResult = JSON.parse(result.result[0]?.trim())
-        expect(parsedResult.includes('bubble'))
-        expect(parsedResult.includes('factorial'))
-        expect(parsedResult).toHaveLength(2)
+        const algorithms = parsedResult.algorithms.map((s: string) => s.toLowerCase())
+        expect(algorithms.some(a => a.includes('bubble'))).toBe(true)
+        expect(algorithms.some(a => a.includes('factorial'))).toBe(true)
+        expect(algorithms).toHaveLength(2)
    }, { timeout: TEST_TIMEOUT })

    it.each(models)('should process files in glob subdirectory with model %s', async (modelName) => {
@ -84,46 +123,13 @@ describe('File Operations', () => {
            'file-inclusion',
            modelName,
            TEST_LOG_PATH,
-            'completion',
+            'completion',            
            { include: ['tests/test-data/files/glob/data.json'] }
        )
        testResults.push(result)
        expect(result.result[0]?.trim()).toBe('Injection Barrel')
    }, { timeout: TEST_TIMEOUT })

-    it.each(models)('should identify animals in image files with model %s', async (modelName) => {
-        const result = await runTest(
-            'What animals are shown in these images? Return as JSON array.',
-            '["cat","fox"]',
-            'file-inclusion',
-            modelName,
-            TEST_LOG_PATH,
-            'completion',
-            {
-                include: ['tests/test-data/files/lazyfox.jpg'],
-                logLevel: 2,
-                format: {
-                    type: "object",
-                    properties: {
-                        animals: {
-                            type: "array",
-                            items: {
-                                type: "string"
-                            },
-                            minItems: 2,
-                            maxItems: 2
-                        }
-                    },
-                    required: ["animals"]
-                }
-            } as IKBotOptions
-        )
-        testResults.push(result)
-        const parsedResult = JSON.parse(result.result[0]?.trim())
-        expect(parsedResult.includes('cat'))
-        expect(parsedResult.includes('fox'))
-        expect(parsedResult.length==2)
-    }, { timeout: TEST_TIMEOUT })

    it('should generate markdown report', () => {
        generateTestReport(testResults, 'File Operations Test Results', TEST_REPORT_PATH)
--- a/packages/kbot/tests/unit/reports/all.json
+++ b/packages/kbot/tests/unit/reports/all.json
--- a/packages/kbot/tests/unit/reports/basic.json
+++ b/packages/kbot/tests/unit/reports/basic.json
@ -180,6 +180,173 @@
      "passed": true,
      "duration": 626,
      "category": "basic"
+    },
+    {
+      "test": "addition",
+      "prompt": "add 5 and 3. Return only the number, no explanation.",
+      "result": [
+        "8"
+      ],
+      "expected": "8",
+      "model": "anthropic/claude-sonnet-4",
+      "router": "anthropic/claude-sonnet-4",
+      "timestamp": "2025-06-05T21:19:15.716Z",
+      "passed": true,
+      "duration": 2024,
+      "category": "basic"
+    },
+    {
+      "test": "addition",
+      "prompt": "add 5 and 3. Return only the number, no explanation.",
+      "result": [
+        "8"
+      ],
+      "expected": "8",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-06-05T21:19:16.361Z",
+      "passed": true,
+      "duration": 641,
+      "category": "basic"
+    },
+    {
+      "test": "addition",
+      "prompt": "add 5 and 3. Return only the number, no explanation.",
+      "result": [
+        "8"
+      ],
+      "expected": "8",
+      "model": "deepseek/deepseek-r1:free",
+      "router": "deepseek/deepseek-r1:free",
+      "timestamp": "2025-06-05T21:19:20.162Z",
+      "passed": true,
+      "duration": 3798,
+      "category": "basic"
+    },
+    {
+      "test": "multiplication",
+      "prompt": "multiply 8 and 3. Return only the number, no explanation.",
+      "result": [
+        "24"
+      ],
+      "expected": "24",
+      "model": "anthropic/claude-sonnet-4",
+      "router": "anthropic/claude-sonnet-4",
+      "timestamp": "2025-06-05T21:19:21.917Z",
+      "passed": true,
+      "duration": 1752,
+      "category": "basic"
+    },
+    {
+      "test": "multiplication",
+      "prompt": "multiply 8 and 3. Return only the number, no explanation.",
+      "result": [
+        "24"
+      ],
+      "expected": "24",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-06-05T21:19:22.504Z",
+      "passed": true,
+      "duration": 585,
+      "category": "basic"
+    },
+    {
+      "test": "multiplication",
+      "prompt": "multiply 8 and 3. Return only the number, no explanation.",
+      "result": [
+        "24"
+      ],
+      "expected": "24",
+      "model": "deepseek/deepseek-r1:free",
+      "router": "deepseek/deepseek-r1:free",
+      "timestamp": "2025-06-05T21:19:25.779Z",
+      "passed": true,
+      "duration": 3272,
+      "category": "basic"
+    },
+    {
+      "test": "division",
+      "prompt": "divide 15 by 3. Return only the number, no explanation.",
+      "result": [
+        "5"
+      ],
+      "expected": "5",
+      "model": "anthropic/claude-sonnet-4",
+      "router": "anthropic/claude-sonnet-4",
+      "timestamp": "2025-06-05T21:19:27.557Z",
+      "passed": true,
+      "duration": 1775,
+      "category": "basic"
+    },
+    {
+      "test": "division",
+      "prompt": "divide 15 by 3. Return only the number, no explanation.",
+      "result": [
+        "5"
+      ],
+      "expected": "5",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-06-05T21:19:28.041Z",
+      "passed": true,
+      "duration": 481,
+      "category": "basic"
+    },
+    {
+      "test": "division",
+      "prompt": "divide 15 by 3. Return only the number, no explanation.",
+      "result": [
+        "5"
+      ],
+      "expected": "5",
+      "model": "deepseek/deepseek-r1:free",
+      "router": "deepseek/deepseek-r1:free",
+      "timestamp": "2025-06-05T21:19:31.450Z",
+      "passed": true,
+      "duration": 3406,
+      "category": "basic"
+    },
+    {
+      "test": "web_content",
+      "prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
+      "result": [
+        "yes"
+      ],
+      "expected": "yes",
+      "model": "anthropic/claude-sonnet-4",
+      "router": "anthropic/claude-sonnet-4",
+      "timestamp": "2025-06-05T21:19:37.473Z",
+      "passed": true,
+      "duration": 6020,
+      "category": "basic"
+    },
+    {
+      "test": "web_content",
+      "prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
+      "result": [
+        "yes"
+      ],
+      "expected": "yes",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-06-05T21:19:42.394Z",
+      "passed": true,
+      "duration": 4917,
+      "category": "basic"
+    },
+    {
+      "test": "web_content",
+      "prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
+      "result": [],
+      "expected": "yes",
+      "model": "deepseek/deepseek-r1:free",
+      "router": "deepseek/deepseek-r1:free",
+      "timestamp": "2025-06-05T21:19:47.544Z",
+      "passed": false,
+      "duration": 5147,
+      "reason": "Model returned empty response",
+      "category": "basic"
    }
  ],
  "highscores": [
@ -188,8 +355,8 @@
      "rankings": [
        {
          "model": "openai/gpt-4o-mini",
-          "duration": 738,
-          "duration_secs": 0.738
+          "duration": 641,
+          "duration_secs": 0.641
        },
        {
          "model": "openai/gpt-3.5-turbo",
@ -201,30 +368,30 @@
    {
      "test": "multiplication",
      "rankings": [
+        {
+          "model": "openai/gpt-4o-mini",
+          "duration": 585,
+          "duration_secs": 0.585
+        },
        {
          "model": "openai/gpt-3.5-turbo",
          "duration": 624,
          "duration_secs": 0.624
-        },
-        {
-          "model": "openai/gpt-4o-mini",
-          "duration": 626,
-          "duration_secs": 0.626
        }
      ]
    },
    {
      "test": "division",
      "rankings": [
+        {
+          "model": "openai/gpt-4o-mini",
+          "duration": 481,
+          "duration_secs": 0.481
+        },
        {
          "model": "openai/gpt-3.5-turbo",
          "duration": 513,
          "duration_secs": 0.513
-        },
-        {
-          "model": "openai/gpt-4o-mini",
-          "duration": 895,
-          "duration_secs": 0.895
        }
      ]
    },
@ -238,11 +405,11 @@
        },
        {
          "model": "openai/gpt-4o-mini",
-          "duration": 4358,
-          "duration_secs": 4.358
+          "duration": 4917,
+          "duration_secs": 4.917
        }
      ]
    }
  ],
-  "lastUpdated": "2025-06-05T18:56:45.466Z"
+  "lastUpdated": "2025-06-05T21:19:47.545Z"
 }
--- a/packages/kbot/tests/unit/reports/basic.md
+++ b/packages/kbot/tests/unit/reports/basic.md
@ -6,89 +6,125 @@

 | Test | Model | Duration (ms) | Duration (s) |
 |------|-------|--------------|--------------|
-| addition | openai/gpt-4o-mini | 514 | 0.51 |
-| addition | openai/gpt-3.5-turbo | 771 | 0.77 |
-| multiplication | openai/gpt-3.5-turbo | 624 | 0.62 |
-| multiplication | openai/gpt-4o-mini | 721 | 0.72 |
-| division | openai/gpt-3.5-turbo | 513 | 0.51 |
-| division | openai/gpt-4o-mini | 895 | 0.90 |
-| web_content | openai/gpt-3.5-turbo | 220 | 0.22 |
-| web_content | openai/gpt-4o-mini | 4358 | 4.36 |
+| addition | openai/gpt-4o-mini | 641 | 0.64 |
+| addition | anthropic/claude-sonnet-4 | 2024 | 2.02 |
+| addition | deepseek/deepseek-r1:free | 3798 | 3.80 |
+| multiplication | openai/gpt-4o-mini | 585 | 0.58 |
+| multiplication | anthropic/claude-sonnet-4 | 1752 | 1.75 |
+| multiplication | deepseek/deepseek-r1:free | 3272 | 3.27 |
+| division | openai/gpt-4o-mini | 481 | 0.48 |
+| division | anthropic/claude-sonnet-4 | 1775 | 1.77 |
+| division | deepseek/deepseek-r1:free | 3406 | 3.41 |
+| web_content | openai/gpt-4o-mini | 4917 | 4.92 |
+| web_content | deepseek/deepseek-r1:free | 5147 | 5.15 |
+| web_content | anthropic/claude-sonnet-4 | 6020 | 6.02 |

 ## Summary

- Total Tests: 8
- Passed: 7
+- Total Tests: 12
+- Passed: 11
 - Failed: 1
- Success Rate: 87.50%
- Average Duration: 1077ms (1.08s)
+- Success Rate: 91.67%
+- Average Duration: 2818ms (2.82s)

 ## Failed Tests

-### web_content - openai/gpt-3.5-turbo
+### web_content - deepseek/deepseek-r1:free

 - Prompt: `Check if the content contains a section about Human prehistory. Reply with "yes" if it does, "no" if it does not.`
 - Expected: `yes`
 - Actual: ``
- Duration: 220ms (0.22s)
+- Duration: 5147ms (5.15s)
 - Reason: Model returned empty response
- Timestamp: 6/5/2025, 8:46:11 PM
+- Timestamp: 6/5/2025, 11:19:47 PM

 ## Passed Tests

-### addition - openai/gpt-3.5-turbo
+### addition - anthropic/claude-sonnet-4

 - Prompt: `add 5 and 3. Return only the number, no explanation.`
 - Expected: `8`
 - Actual: `8`
- Duration: 771ms (0.77s)
- Timestamp: 6/5/2025, 8:46:08 PM
+- Duration: 2024ms (2.02s)
+- Timestamp: 6/5/2025, 11:19:15 PM

 ### addition - openai/gpt-4o-mini

 - Prompt: `add 5 and 3. Return only the number, no explanation.`
 - Expected: `8`
 - Actual: `8`
- Duration: 514ms (0.51s)
- Timestamp: 6/5/2025, 8:46:08 PM
+- Duration: 641ms (0.64s)
+- Timestamp: 6/5/2025, 11:19:16 PM

-### multiplication - openai/gpt-3.5-turbo
+### addition - deepseek/deepseek-r1:free
+
+- Prompt: `add 5 and 3. Return only the number, no explanation.`
+- Expected: `8`
+- Actual: `8`
+- Duration: 3798ms (3.80s)
+- Timestamp: 6/5/2025, 11:19:20 PM
+
+### multiplication - anthropic/claude-sonnet-4

 - Prompt: `multiply 8 and 3. Return only the number, no explanation.`
 - Expected: `24`
 - Actual: `24`
- Duration: 624ms (0.62s)
- Timestamp: 6/5/2025, 8:46:09 PM
+- Duration: 1752ms (1.75s)
+- Timestamp: 6/5/2025, 11:19:21 PM

 ### multiplication - openai/gpt-4o-mini

 - Prompt: `multiply 8 and 3. Return only the number, no explanation.`
 - Expected: `24`
 - Actual: `24`
- Duration: 721ms (0.72s)
- Timestamp: 6/5/2025, 8:46:09 PM
+- Duration: 585ms (0.58s)
+- Timestamp: 6/5/2025, 11:19:22 PM

-### division - openai/gpt-3.5-turbo
+### multiplication - deepseek/deepseek-r1:free
+
+- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
+- Expected: `24`
+- Actual: `24`
+- Duration: 3272ms (3.27s)
+- Timestamp: 6/5/2025, 11:19:25 PM
+
+### division - anthropic/claude-sonnet-4

 - Prompt: `divide 15 by 3. Return only the number, no explanation.`
 - Expected: `5`
 - Actual: `5`
- Duration: 513ms (0.51s)
- Timestamp: 6/5/2025, 8:46:10 PM
+- Duration: 1775ms (1.77s)
+- Timestamp: 6/5/2025, 11:19:27 PM

 ### division - openai/gpt-4o-mini

 - Prompt: `divide 15 by 3. Return only the number, no explanation.`
 - Expected: `5`
 - Actual: `5`
- Duration: 895ms (0.90s)
- Timestamp: 6/5/2025, 8:46:11 PM
+- Duration: 481ms (0.48s)
+- Timestamp: 6/5/2025, 11:19:28 PM
+
+### division - deepseek/deepseek-r1:free
+
+- Prompt: `divide 15 by 3. Return only the number, no explanation.`
+- Expected: `5`
+- Actual: `5`
+- Duration: 3406ms (3.41s)
+- Timestamp: 6/5/2025, 11:19:31 PM
+
+### web_content - anthropic/claude-sonnet-4
+
+- Prompt: `Check if the content contains a section about Human prehistory. Reply with "yes" if it does, "no" if it does not.`
+- Expected: `yes`
+- Actual: `yes`
+- Duration: 6020ms (6.02s)
+- Timestamp: 6/5/2025, 11:19:37 PM

 ### web_content - openai/gpt-4o-mini

 - Prompt: `Check if the content contains a section about Human prehistory. Reply with "yes" if it does, "no" if it does not.`
 - Expected: `yes`
- Actual: `Yes`
- Duration: 4358ms (4.36s)
- Timestamp: 6/5/2025, 8:46:15 PM
+- Actual: `yes`
+- Duration: 4917ms (4.92s)
+- Timestamp: 6/5/2025, 11:19:42 PM

--- a/packages/kbot/tests/unit/reports/files.json
+++ b/packages/kbot/tests/unit/reports/files.json
--- a/packages/kbot/tests/unit/reports/files.md
+++ b/packages/kbot/tests/unit/reports/files.md
@ -6,44 +6,27 @@

 | Test | Model | Duration (ms) | Duration (s) |
 |------|-------|--------------|--------------|
-| file-inclusion | openai/gpt-4o-mini | 2223 | 2.22 |
-| file-inclusion | google/gemini-2.0-flash-exp:free | 2404 | 2.40 |
+| file-inclusion | openai/gpt-4o | 614 | 0.61 |

 ## Summary

- Total Tests: 8
- Passed: 2
- Failed: 6
- Success Rate: 25.00%
- Average Duration: 1671ms (1.67s)
+- Total Tests: 4
+- Passed: 3
+- Failed: 1
+- Success Rate: 75.00%
+- Average Duration: 1380ms (1.38s)

 ## Failed Tests

-### file-inclusion - openai/gpt-4o-mini
-
- Prompt: `What animals are shown in these images? Return as JSON array.`
- Expected: `["cat","fox"]`
- Actual: `["cat", "fox"]`
- Duration: 2223ms (2.22s)
- Reason: Expected ["cat","fox"], but got ["cat", "fox"]
- Timestamp: 6/5/2025, 8:46:17 PM
-
-### file-inclusion - google/gemini-2.0-flash-exp:free
-
- Prompt: `What animals are shown in these images? Return as JSON array.`
- Expected: `["cat","fox"]`
- Actual: `[
-  "cat",
-  "fox"
-]`
- Duration: 2404ms (2.40s)
- Reason: Expected ["cat","fox"], but got [
-  "cat",
-  "fox"
-]
- Timestamp: 6/5/2025, 8:46:20 PM
+*No failed tests*

 ## Passed Tests

-*No passed tests*
+### file-inclusion - openai/gpt-4o
+
+- Prompt: `What is the title of the product in data.json? Return only the title.`
+- Expected: `Injection Barrel`
+- Actual: `Injection Barrel`
+- Duration: 614ms (0.61s)
+- Timestamp: 6/6/2025, 12:29:46 AM

--- a/packages/kbot/tests/unit/reports/language.json
+++ b/packages/kbot/tests/unit/reports/language.json
@ -1451,21 +1451,237 @@
      "passed": true,
      "duration": 2616,
      "category": "language"
+    },
+    {
+      "test": "translation",
+      "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
+      "result": [
+        "¡Hola, mundo!"
+      ],
+      "expected": "¡Hola, mundo!",
+      "model": "anthropic/claude-sonnet-4",
+      "router": "anthropic/claude-sonnet-4",
+      "timestamp": "2025-06-05T22:30:21.412Z",
+      "passed": true,
+      "duration": 1560,
+      "category": "language"
+    },
+    {
+      "test": "translation",
+      "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
+      "result": [
+        "¡Hola, mundo!"
+      ],
+      "expected": "¡Hola, mundo!",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-06-05T22:30:22.869Z",
+      "passed": true,
+      "duration": 1451,
+      "category": "language"
+    },
+    {
+      "test": "translation",
+      "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
+      "result": [
+        "¡Hola, mundo!"
+      ],
+      "expected": "¡Hola, mundo!",
+      "model": "deepseek/deepseek-r1:free",
+      "router": "deepseek/deepseek-r1:free",
+      "timestamp": "2025-06-05T22:30:28.307Z",
+      "passed": true,
+      "duration": 5434,
+      "category": "language"
+    },
+    {
+      "test": "grammar",
+      "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
+      "result": [
+        "I went to the store yesterday."
+      ],
+      "expected": "I went to the store yesterday",
+      "model": "anthropic/claude-sonnet-4",
+      "router": "anthropic/claude-sonnet-4",
+      "timestamp": "2025-06-05T22:30:29.513Z",
+      "passed": false,
+      "duration": 1201,
+      "reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
+      "category": "language"
+    },
+    {
+      "test": "grammar",
+      "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
+      "result": [
+        "\"I went to the store yesterday.\""
+      ],
+      "expected": "I went to the store yesterday",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-06-05T22:30:30.212Z",
+      "passed": false,
+      "duration": 695,
+      "reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
+      "category": "language"
+    },
+    {
+      "test": "grammar",
+      "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
+      "result": [
+        "I went to the store yesterday."
+      ],
+      "expected": "I went to the store yesterday",
+      "model": "deepseek/deepseek-r1:free",
+      "router": "deepseek/deepseek-r1:free",
+      "timestamp": "2025-06-05T22:30:33.611Z",
+      "passed": false,
+      "duration": 3395,
+      "reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
+      "category": "language"
+    },
+    {
+      "test": "summarization",
+      "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
+      "result": [
+        "A brown fox leaps over a dog."
+      ],
+      "expected": "A fox jumps over a dog",
+      "model": "anthropic/claude-sonnet-4",
+      "router": "anthropic/claude-sonnet-4",
+      "timestamp": "2025-06-05T22:30:34.920Z",
+      "passed": false,
+      "duration": 1304,
+      "reason": "Expected A fox jumps over a dog, but got A brown fox leaps over a dog.",
+      "category": "language"
+    },
+    {
+      "test": "summarization",
+      "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
+      "result": [
+        "A fox jumps over a dog."
+      ],
+      "expected": "A fox jumps over a dog",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-06-05T22:30:35.620Z",
+      "passed": false,
+      "duration": 692,
+      "reason": "Expected A fox jumps over a dog, but got A fox jumps over a dog.",
+      "category": "language"
+    },
+    {
+      "test": "summarization",
+      "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
+      "result": [
+        "\"A quick brown fox leaps over a dog.\""
+      ],
+      "expected": "A fox jumps over a dog",
+      "model": "deepseek/deepseek-r1:free",
+      "router": "deepseek/deepseek-r1:free",
+      "timestamp": "2025-06-05T22:30:49.662Z",
+      "passed": false,
+      "duration": 14038,
+      "reason": "Expected A fox jumps over a dog, but got \"A quick brown fox leaps over a dog.\"",
+      "category": "language"
+    },
+    {
+      "test": "language_detection",
+      "prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
+      "result": [
+        "French"
+      ],
+      "expected": "French",
+      "model": "anthropic/claude-sonnet-4",
+      "router": "anthropic/claude-sonnet-4",
+      "timestamp": "2025-06-05T22:30:50.805Z",
+      "passed": true,
+      "duration": 1137,
+      "category": "language"
+    },
+    {
+      "test": "language_detection",
+      "prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
+      "result": [
+        "French"
+      ],
+      "expected": "French",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-06-05T22:30:51.269Z",
+      "passed": true,
+      "duration": 459,
+      "category": "language"
+    },
+    {
+      "test": "language_detection",
+      "prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
+      "result": [
+        "French"
+      ],
+      "expected": "French",
+      "model": "deepseek/deepseek-r1:free",
+      "router": "deepseek/deepseek-r1:free",
+      "timestamp": "2025-06-05T22:30:55.198Z",
+      "passed": true,
+      "duration": 3924,
+      "category": "language"
+    },
+    {
+      "test": "synonyms",
+      "prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
+      "result": [
+        "Joyful"
+      ],
+      "expected": "joyful",
+      "model": "anthropic/claude-sonnet-4",
+      "router": "anthropic/claude-sonnet-4",
+      "timestamp": "2025-06-05T22:30:56.455Z",
+      "passed": true,
+      "duration": 1251,
+      "category": "language"
+    },
+    {
+      "test": "synonyms",
+      "prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
+      "result": [
+        "Joyful"
+      ],
+      "expected": "joyful",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-06-05T22:30:57.083Z",
+      "passed": true,
+      "duration": 622,
+      "category": "language"
+    },
+    {
+      "test": "synonyms",
+      "prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
+      "result": [
+        "joyful"
+      ],
+      "expected": "joyful",
+      "model": "deepseek/deepseek-r1:free",
+      "router": "deepseek/deepseek-r1:free",
+      "timestamp": "2025-06-05T22:31:00.924Z",
+      "passed": true,
+      "duration": 3836,
+      "category": "language"
    }
  ],
  "highscores": [
    {
      "test": "translation",
      "rankings": [
-        {
-          "model": "openai/gpt-4o-mini",
-          "duration": 666,
-          "duration_secs": 0.666
-        },
        {
          "model": "openai/gpt-3.5-turbo",
          "duration": 818,
          "duration_secs": 0.818
+        },
+        {
+          "model": "openai/gpt-4o-mini",
+          "duration": 1451,
+          "duration_secs": 1.451
        }
      ]
    },
@ -1479,8 +1695,8 @@
        },
        {
          "model": "openai/gpt-4o-mini",
-          "duration": 1171,
-          "duration_secs": 1.171
+          "duration": 695,
+          "duration_secs": 0.695
        }
      ]
    },
@ -1494,41 +1710,41 @@
        },
        {
          "model": "openai/gpt-4o-mini",
-          "duration": 699,
-          "duration_secs": 0.699
+          "duration": 692,
+          "duration_secs": 0.692
        }
      ]
    },
    {
      "test": "language_detection",
      "rankings": [
+        {
+          "model": "openai/gpt-4o-mini",
+          "duration": 459,
+          "duration_secs": 0.459
+        },
        {
          "model": "openai/gpt-3.5-turbo",
          "duration": 695,
          "duration_secs": 0.695
-        },
-        {
-          "model": "openai/gpt-4o-mini",
-          "duration": 776,
-          "duration_secs": 0.776
        }
      ]
    },
    {
      "test": "synonyms",
      "rankings": [
-        {
-          "model": "openai/gpt-4o-mini",
-          "duration": 548,
-          "duration_secs": 0.548
-        },
        {
          "model": "openai/gpt-3.5-turbo",
          "duration": 570,
          "duration_secs": 0.57
+        },
+        {
+          "model": "openai/gpt-4o-mini",
+          "duration": 622,
+          "duration_secs": 0.622
        }
      ]
    }
  ],
-  "lastUpdated": "2025-06-05T18:56:08.627Z"
+  "lastUpdated": "2025-06-05T22:31:00.924Z"
 }
--- a/packages/kbot/tests/unit/reports/language.md
+++ b/packages/kbot/tests/unit/reports/language.md
@ -6,164 +6,157 @@

 | Test | Model | Duration (ms) | Duration (s) |
 |------|-------|--------------|--------------|
-| translation | openai/gpt-4o-mini | 666 | 0.67 |
-| translation | anthropic/claude-sonnet-4 | 1317 | 1.32 |
-| translation | deepseek/deepseek-r1:free | 5397 | 5.40 |
-| grammar | openai/gpt-4o-mini | 1171 | 1.17 |
-| grammar | anthropic/claude-sonnet-4 | 1722 | 1.72 |
-| grammar | deepseek/deepseek-r1:free | 5199 | 5.20 |
-| summarization | openai/gpt-4o-mini | 699 | 0.70 |
-| summarization | anthropic/claude-sonnet-4 | 1820 | 1.82 |
-| summarization | deepseek/deepseek-r1:free | 7380 | 7.38 |
-| language_detection | openai/gpt-4o-mini | 776 | 0.78 |
-| language_detection | anthropic/claude-sonnet-4 | 1725 | 1.73 |
-| language_detection | deepseek/deepseek-r1:free | 5247 | 5.25 |
-| synonyms | openai/gpt-4o-mini | 548 | 0.55 |
-| synonyms | anthropic/claude-sonnet-4 | 1967 | 1.97 |
-| synonyms | deepseek/deepseek-r1:free | 2616 | 2.62 |
+| translation | openai/gpt-4o-mini | 1451 | 1.45 |
+| translation | anthropic/claude-sonnet-4 | 1560 | 1.56 |
+| translation | deepseek/deepseek-r1:free | 5434 | 5.43 |
+| grammar | openai/gpt-4o-mini | 695 | 0.69 |
+| grammar | anthropic/claude-sonnet-4 | 1201 | 1.20 |
+| grammar | deepseek/deepseek-r1:free | 3395 | 3.40 |
+| summarization | openai/gpt-4o-mini | 692 | 0.69 |
+| summarization | anthropic/claude-sonnet-4 | 1304 | 1.30 |
+| summarization | deepseek/deepseek-r1:free | 14038 | 14.04 |
+| language_detection | openai/gpt-4o-mini | 459 | 0.46 |
+| language_detection | anthropic/claude-sonnet-4 | 1137 | 1.14 |
+| language_detection | deepseek/deepseek-r1:free | 3924 | 3.92 |
+| synonyms | openai/gpt-4o-mini | 622 | 0.62 |
+| synonyms | anthropic/claude-sonnet-4 | 1251 | 1.25 |
+| synonyms | deepseek/deepseek-r1:free | 3836 | 3.84 |

 ## Summary

 - Total Tests: 15
- Passed: 2
- Failed: 13
- Success Rate: 13.33%
- Average Duration: 2550ms (2.55s)
+- Passed: 9
+- Failed: 6
+- Success Rate: 60.00%
+- Average Duration: 2733ms (2.73s)

 ## Failed Tests

-### translation - anthropic/claude-sonnet-4
-
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
- Expected: `¡Hola, mundo!`
- Actual: `¡Hola, mundo!`
- Duration: 1317ms (1.32s)
- Reason: Expected ¡Hola, mundo!, but got ¡hola, mundo!
- Timestamp: 6/5/2025, 8:55:31 PM
-
-### translation - openai/gpt-4o-mini
-
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
- Expected: `¡Hola, mundo!`
- Actual: `¡Hola, mundo!`
- Duration: 666ms (0.67s)
- Reason: Expected ¡Hola, mundo!, but got ¡hola, mundo!
- Timestamp: 6/5/2025, 8:55:32 PM
-
-### translation - deepseek/deepseek-r1:free
-
- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
- Expected: `¡Hola, mundo!`
- Actual: `¡Hola, mundo!`
- Duration: 5397ms (5.40s)
- Reason: Expected ¡Hola, mundo!, but got ¡hola, mundo!
- Timestamp: 6/5/2025, 8:55:37 PM
-
 ### grammar - anthropic/claude-sonnet-4

 - Prompt: `Correct the grammar in: "I goes to the store yesterday". Return only the corrected sentence, no explanation.`
 - Expected: `I went to the store yesterday`
 - Actual: `I went to the store yesterday.`
- Duration: 1722ms (1.72s)
- Reason: Expected I went to the store yesterday, but got i went to the store yesterday.
- Timestamp: 6/5/2025, 8:55:39 PM
+- Duration: 1201ms (1.20s)
+- Reason: Expected I went to the store yesterday, but got I went to the store yesterday.
+- Timestamp: 6/6/2025, 12:30:29 AM

 ### grammar - openai/gpt-4o-mini

 - Prompt: `Correct the grammar in: "I goes to the store yesterday". Return only the corrected sentence, no explanation.`
 - Expected: `I went to the store yesterday`
 - Actual: `"I went to the store yesterday."`
- Duration: 1171ms (1.17s)
- Reason: Expected I went to the store yesterday, but got "i went to the store yesterday."
- Timestamp: 6/5/2025, 8:55:40 PM
+- Duration: 695ms (0.69s)
+- Reason: Expected I went to the store yesterday, but got "I went to the store yesterday."
+- Timestamp: 6/6/2025, 12:30:30 AM

 ### grammar - deepseek/deepseek-r1:free

 - Prompt: `Correct the grammar in: "I goes to the store yesterday". Return only the corrected sentence, no explanation.`
 - Expected: `I went to the store yesterday`
- Actual: `"I went to the store yesterday."`
- Duration: 5199ms (5.20s)
- Reason: Expected I went to the store yesterday, but got "i went to the store yesterday."
- Timestamp: 6/5/2025, 8:55:45 PM
+- Actual: `I went to the store yesterday.`
+- Duration: 3395ms (3.40s)
+- Reason: Expected I went to the store yesterday, but got I went to the store yesterday.
+- Timestamp: 6/6/2025, 12:30:33 AM

 ### summarization - anthropic/claude-sonnet-4

 - Prompt: `Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.`
 - Expected: `A fox jumps over a dog`
- Actual: `A fox jumps over a dog.`
- Duration: 1820ms (1.82s)
- Reason: Expected A fox jumps over a dog, but got a fox jumps over a dog.
- Timestamp: 6/5/2025, 8:55:47 PM
+- Actual: `A brown fox leaps over a dog.`
+- Duration: 1304ms (1.30s)
+- Reason: Expected A fox jumps over a dog, but got A brown fox leaps over a dog.
+- Timestamp: 6/6/2025, 12:30:34 AM

 ### summarization - openai/gpt-4o-mini

 - Prompt: `Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.`
 - Expected: `A fox jumps over a dog`
 - Actual: `A fox jumps over a dog.`
- Duration: 699ms (0.70s)
- Reason: Expected A fox jumps over a dog, but got a fox jumps over a dog.
- Timestamp: 6/5/2025, 8:55:48 PM
+- Duration: 692ms (0.69s)
+- Reason: Expected A fox jumps over a dog, but got A fox jumps over a dog.
+- Timestamp: 6/6/2025, 12:30:35 AM

 ### summarization - deepseek/deepseek-r1:free

 - Prompt: `Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.`
 - Expected: `A fox jumps over a dog`
- Actual: `A quick brown fox leaps over a dog.`
- Duration: 7380ms (7.38s)
- Reason: Expected A fox jumps over a dog, but got a quick brown fox leaps over a dog.
- Timestamp: 6/5/2025, 8:55:55 PM
+- Actual: `"A quick brown fox leaps over a dog."`
+- Duration: 14038ms (14.04s)
+- Reason: Expected A fox jumps over a dog, but got "A quick brown fox leaps over a dog."
+- Timestamp: 6/6/2025, 12:30:49 AM
+
+## Passed Tests
+
+### translation - anthropic/claude-sonnet-4
+
+- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
+- Expected: `¡Hola, mundo!`
+- Actual: `¡Hola, mundo!`
+- Duration: 1560ms (1.56s)
+- Timestamp: 6/6/2025, 12:30:21 AM
+
+### translation - openai/gpt-4o-mini
+
+- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
+- Expected: `¡Hola, mundo!`
+- Actual: `¡Hola, mundo!`
+- Duration: 1451ms (1.45s)
+- Timestamp: 6/6/2025, 12:30:22 AM
+
+### translation - deepseek/deepseek-r1:free
+
+- Prompt: `Translate "Hello, world!" to Spanish. Return only the translation, no explanation.`
+- Expected: `¡Hola, mundo!`
+- Actual: `¡Hola, mundo!`
+- Duration: 5434ms (5.43s)
+- Timestamp: 6/6/2025, 12:30:28 AM

 ### language_detection - anthropic/claude-sonnet-4

 - Prompt: `Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.`
 - Expected: `French`
 - Actual: `French`
- Duration: 1725ms (1.73s)
- Reason: Expected French, but got french
- Timestamp: 6/5/2025, 8:55:57 PM
+- Duration: 1137ms (1.14s)
+- Timestamp: 6/6/2025, 12:30:50 AM

 ### language_detection - openai/gpt-4o-mini

 - Prompt: `Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.`
 - Expected: `French`
 - Actual: `French`
- Duration: 776ms (0.78s)
- Reason: Expected French, but got french
- Timestamp: 6/5/2025, 8:55:58 PM
+- Duration: 459ms (0.46s)
+- Timestamp: 6/6/2025, 12:30:51 AM

 ### language_detection - deepseek/deepseek-r1:free

 - Prompt: `Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.`
 - Expected: `French`
 - Actual: `French`
- Duration: 5247ms (5.25s)
- Reason: Expected French, but got french
- Timestamp: 6/5/2025, 8:56:03 PM
+- Duration: 3924ms (3.92s)
+- Timestamp: 6/6/2025, 12:30:55 AM

 ### synonyms - anthropic/claude-sonnet-4

 - Prompt: `Provide a synonym for "happy". Return only the synonym, no explanation.`
 - Expected: `joyful`
- Actual: `Content`
- Duration: 1967ms (1.97s)
- Reason: Expected joyful, but got content
- Timestamp: 6/5/2025, 8:56:05 PM
-
-## Passed Tests
+- Actual: `Joyful`
+- Duration: 1251ms (1.25s)
+- Timestamp: 6/6/2025, 12:30:56 AM

 ### synonyms - openai/gpt-4o-mini

 - Prompt: `Provide a synonym for "happy". Return only the synonym, no explanation.`
 - Expected: `joyful`
 - Actual: `Joyful`
- Duration: 548ms (0.55s)
- Timestamp: 6/5/2025, 8:56:06 PM
+- Duration: 622ms (0.62s)
+- Timestamp: 6/6/2025, 12:30:57 AM

 ### synonyms - deepseek/deepseek-r1:free

 - Prompt: `Provide a synonym for "happy". Return only the synonym, no explanation.`
 - Expected: `joyful`
- Actual: `Joyful`
- Duration: 2616ms (2.62s)
- Timestamp: 6/5/2025, 8:56:08 PM
+- Actual: `joyful`
+- Duration: 3836ms (3.84s)
+- Timestamp: 6/6/2025, 12:31:00 AM