From c5eadead317aaca6ed46c4df86e4b66ed523ab09 Mon Sep 17 00:00:00 2001 From: babayaga Date: Fri, 6 Jun 2025 00:53:58 +0200 Subject: [PATCH] kbot llm eq cmp :): --- packages/kbot/logs/params.json | 2 +- packages/kbot/tests/unit/commons.ts | 57 ++- packages/kbot/tests/unit/language.test.ts | 31 +- packages/kbot/tests/unit/reports/all.json | 348 +++++++++++++++++- .../kbot/tests/unit/reports/language.json | 348 +++++++++++++++++- 5 files changed, 744 insertions(+), 42 deletions(-) diff --git a/packages/kbot/logs/params.json b/packages/kbot/logs/params.json index 91b5fcf8..6633a7f0 100644 --- a/packages/kbot/logs/params.json +++ b/packages/kbot/logs/params.json @@ -1,5 +1,5 @@ { - "model": "deepseek/deepseek-r1:free", + "model": "anthropic/claude-sonnet-4", "messages": [ { "role": "user", diff --git a/packages/kbot/tests/unit/commons.ts b/packages/kbot/tests/unit/commons.ts index 89289960..1a819516 100644 --- a/packages/kbot/tests/unit/commons.ts +++ b/packages/kbot/tests/unit/commons.ts @@ -24,7 +24,7 @@ export enum EqualityCheck { NONE = 'none' } -export type EqualityFn = (actual: string, expected: string) => Promise; +export type EqualityFn = (actual: string, expected: string, prompt?: string) => Promise; export const EQUALITY_CHECKS: Record = { [EqualityCheck.DEFAULT]: async (actual: string, expected: string): Promise => { @@ -40,6 +40,55 @@ export const EQUALITY_CHECKS: Record = { return false; } }, + [EqualityCheck.LLM_EQUAL]: async (actual: string, expected: string, prompt?: string): Promise => { + try { + const model = getTestEqualModels()[0]; + if (!model) { + return false; + } + const comparisonPrompt = `You are an assistant that judges if two AI responses are semantically equivalent for a given prompt. +The original prompt was: "${prompt}" + +Response A: +"${actual}" +Response B (expected): +"${expected}" +Are these two responses semantically equivalent? Consider that minor differences in formatting (like commas, casing) or phrasing should be ignored as long as the meaning is the same. Dont comment, just return the JSON object.`; + const responseSchema = { + type: 'object', + properties: { + equivalent: { + type: 'boolean', + description: "Whether the two responses are semantically equivalent." + }, + semantic_distance: { + type: 'number', + description: "The semantic distance between the two responses, on a scale of 0 to 100." + } + }, + required: ['equivalent', 'semantic_distance'] + }; + + const zodSchema = JSONSchemaToZod.convert(responseSchema); + const format = zodResponseFormat(zodSchema, 'equivalent'); + + const result = await run({ + prompt: comparisonPrompt, + model, + mode: 'completion', + format + }); + + if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) { + return false; + } + + const parsedResult = JSON.parse(result[0] as string); + return parsedResult.equivalent === true; + } catch (e) { + return false; + } + }, [EqualityCheck.NONE]: async (): Promise => { return true; }, @@ -55,7 +104,7 @@ export const getFastModels = (): string[] => { export const getTestEqualModels = (): string[] => { return [ - E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE + E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O ] } @@ -248,7 +297,7 @@ export const runTest = async ( ) ]) as string[] - if (isEmptyResponse(result)) { + if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) { testResult = { test: testName, prompt, @@ -265,7 +314,7 @@ export const runTest = async ( const actual = result?.[0] || '' const equalityCheck = options.equalityCheck || EqualityCheck.DEFAULT const checkFn = EQUALITY_CHECKS[equalityCheck] || EQUALITY_CHECKS[EqualityCheck.DEFAULT] - const passed = await checkFn(actual, expected) + const passed = await checkFn(actual, expected, prompt) testResult = { test: testName, diff --git a/packages/kbot/tests/unit/language.test.ts b/packages/kbot/tests/unit/language.test.ts index a2374b53..1d1e245e 100644 --- a/packages/kbot/tests/unit/language.test.ts +++ b/packages/kbot/tests/unit/language.test.ts @@ -10,7 +10,8 @@ import { TestResult, runTest, generateTestReport, - getReportPaths + getReportPaths, + EqualityCheck } from './commons' // Optionally override models for this specific test file @@ -27,6 +28,22 @@ describe('Language Operations', () => { const TEST_LOG_PATH = getReportPaths('language', 'json') const TEST_REPORT_PATH = getReportPaths('language', 'md') + + it.each(models)('should summarize text with model %s', async (modelName) => { + const result = await runTest( + 'Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.', + 'A fox jumps over a dog', + 'summarization', + modelName, + TEST_LOG_PATH, + 'completion', + { equalityCheck: EqualityCheck.LLM_EQUAL } + ) + testResults.push(result) + expect(result.passed).toBe(true) + }, { timeout: TEST_TIMEOUT }) + + it.each(models)('should translate text with model %s', async (modelName) => { const result = await runTest( 'Translate "Hello, world!" to Spanish. Return only the translation, no explanation.', @@ -51,18 +68,6 @@ describe('Language Operations', () => { expect(normalize(result.result[0])).toEqual(normalize('i went to the store yesterday')) }, { timeout: TEST_TIMEOUT }) - it.each(models)('should summarize text with model %s', async (modelName) => { - const result = await runTest( - 'Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.', - 'A fox jumps over a dog', - 'summarization', - modelName, - TEST_LOG_PATH - ) - testResults.push(result) - expect(normalize(result.result[0])).toEqual(normalize('a fox jumps over a dog')) - }, { timeout: TEST_TIMEOUT }) - it.each(models)('should identify language with model %s', async (modelName) => { const result = await runTest( 'Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.', diff --git a/packages/kbot/tests/unit/reports/all.json b/packages/kbot/tests/unit/reports/all.json index 0bc3eec1..acde51dc 100644 --- a/packages/kbot/tests/unit/reports/all.json +++ b/packages/kbot/tests/unit/reports/all.json @@ -4757,6 +4757,330 @@ "passed": true, "duration": 3836, "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:41:10.838Z", + "passed": true, + "duration": 1465, + "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:41:11.649Z", + "passed": true, + "duration": 805, + "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:41:16.717Z", + "passed": true, + "duration": 5063, + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "I went to the store yesterday." + ], + "expected": "I went to the store yesterday", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:41:17.368Z", + "passed": false, + "duration": 646, + "reason": "Expected I went to the store yesterday, but got I went to the store yesterday.", + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "\"I went to the store yesterday.\"" + ], + "expected": "I went to the store yesterday", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:41:18.259Z", + "passed": false, + "duration": 886, + "reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"", + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "I went to the store yesterday." + ], + "expected": "I went to the store yesterday", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:41:23.006Z", + "passed": false, + "duration": 4742, + "reason": "Expected I went to the store yesterday, but got I went to the store yesterday.", + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [ + "A brown fox leaps over a dog." + ], + "expected": "A fox jumps over a dog", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:41:32.126Z", + "passed": false, + "duration": 9115, + "reason": "Expected A fox jumps over a dog, but got A brown fox leaps over a dog.", + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [ + "A brown fox jumps over a dog." + ], + "expected": "A fox jumps over a dog", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:44:29.781Z", + "passed": false, + "duration": 6689, + "reason": "Expected A fox jumps over a dog, but got A brown fox jumps over a dog.", + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [ + "A fox jumps over a dog." + ], + "expected": "A fox jumps over a dog", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:45:17.372Z", + "passed": true, + "duration": 47581, + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [ + "A fox jumps over a dog." + ], + "expected": "A fox jumps over a dog", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:52:22.430Z", + "passed": true, + "duration": 27328, + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [ + "A fox leaps over a dog." + ], + "expected": "A fox jumps over a dog", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:52:30.984Z", + "passed": true, + "duration": 8548, + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [], + "expected": "A fox jumps over a dog", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:53:31.003Z", + "passed": false, + "duration": 60014, + "error": { + "message": "API call timed out", + "code": "UNKNOWN", + "type": "Error", + "details": { + "stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:296:33)\n at listOnTimeout (node:internal/timers:588:17)\n at processTimers (node:internal/timers:523:7)", + "message": "API call timed out" + } + }, + "reason": "API call timed out", + "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:53:32.418Z", + "passed": true, + "duration": 1408, + "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:53:33.063Z", + "passed": true, + "duration": 639, + "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:53:35.309Z", + "passed": true, + "duration": 2241, + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "I went to the store yesterday." + ], + "expected": "I went to the store yesterday", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:53:35.959Z", + "passed": false, + "duration": 645, + "reason": "Expected I went to the store yesterday, but got I went to the store yesterday.", + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "\"I went to the store yesterday.\"" + ], + "expected": "I went to the store yesterday", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:53:36.606Z", + "passed": false, + "duration": 641, + "reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"", + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "I went to the store yesterday." + ], + "expected": "I went to the store yesterday", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:53:41.701Z", + "passed": false, + "duration": 5090, + "reason": "Expected I went to the store yesterday, but got I went to the store yesterday.", + "category": "language" + }, + { + "test": "language_detection", + "prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.", + "result": [ + "French" + ], + "expected": "French", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:53:42.454Z", + "passed": true, + "duration": 747, + "category": "language" + }, + { + "test": "language_detection", + "prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.", + "result": [ + "French" + ], + "expected": "French", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:53:43.116Z", + "passed": true, + "duration": 657, + "category": "language" + }, + { + "test": "language_detection", + "prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.", + "result": [ + "French" + ], + "expected": "French", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:53:47.420Z", + "passed": true, + "duration": 4299, + "category": "language" + }, + { + "test": "synonyms", + "prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.", + "result": [ + "Joyful" + ], + "expected": "joyful", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:53:48.762Z", + "passed": true, + "duration": 1336, + "category": "language" } ], "highscores": [ @@ -4898,15 +5222,15 @@ { "test": "translation", "rankings": [ + { + "model": "openai/gpt-4o-mini", + "duration": 639, + "duration_secs": 0.639 + }, { "model": "openai/gpt-3.5-turbo", "duration": 818, "duration_secs": 0.818 - }, - { - "model": "openai/gpt-4o-mini", - "duration": 1451, - "duration_secs": 1.451 } ] }, @@ -4920,8 +5244,8 @@ }, { "model": "openai/gpt-4o-mini", - "duration": 695, - "duration_secs": 0.695 + "duration": 641, + "duration_secs": 0.641 } ] }, @@ -4935,8 +5259,8 @@ }, { "model": "openai/gpt-4o-mini", - "duration": 692, - "duration_secs": 0.692 + "duration": 8548, + "duration_secs": 8.548 } ] }, @@ -4945,8 +5269,8 @@ "rankings": [ { "model": "openai/gpt-4o-mini", - "duration": 459, - "duration_secs": 0.459 + "duration": 657, + "duration_secs": 0.657 }, { "model": "openai/gpt-3.5-turbo", @@ -5171,5 +5495,5 @@ ] } ], - "lastUpdated": "2025-06-05T22:31:00.927Z" + "lastUpdated": "2025-06-05T22:53:48.765Z" } \ No newline at end of file diff --git a/packages/kbot/tests/unit/reports/language.json b/packages/kbot/tests/unit/reports/language.json index fee85023..4febcef2 100644 --- a/packages/kbot/tests/unit/reports/language.json +++ b/packages/kbot/tests/unit/reports/language.json @@ -1667,21 +1667,345 @@ "passed": true, "duration": 3836, "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:41:10.838Z", + "passed": true, + "duration": 1465, + "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:41:11.649Z", + "passed": true, + "duration": 805, + "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:41:16.717Z", + "passed": true, + "duration": 5063, + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "I went to the store yesterday." + ], + "expected": "I went to the store yesterday", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:41:17.368Z", + "passed": false, + "duration": 646, + "reason": "Expected I went to the store yesterday, but got I went to the store yesterday.", + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "\"I went to the store yesterday.\"" + ], + "expected": "I went to the store yesterday", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:41:18.259Z", + "passed": false, + "duration": 886, + "reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"", + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "I went to the store yesterday." + ], + "expected": "I went to the store yesterday", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:41:23.006Z", + "passed": false, + "duration": 4742, + "reason": "Expected I went to the store yesterday, but got I went to the store yesterday.", + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [ + "A brown fox leaps over a dog." + ], + "expected": "A fox jumps over a dog", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:41:32.126Z", + "passed": false, + "duration": 9115, + "reason": "Expected A fox jumps over a dog, but got A brown fox leaps over a dog.", + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [ + "A brown fox jumps over a dog." + ], + "expected": "A fox jumps over a dog", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:44:29.781Z", + "passed": false, + "duration": 6689, + "reason": "Expected A fox jumps over a dog, but got A brown fox jumps over a dog.", + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [ + "A fox jumps over a dog." + ], + "expected": "A fox jumps over a dog", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:45:17.372Z", + "passed": true, + "duration": 47581, + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [ + "A fox jumps over a dog." + ], + "expected": "A fox jumps over a dog", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:52:22.430Z", + "passed": true, + "duration": 27328, + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [ + "A fox leaps over a dog." + ], + "expected": "A fox jumps over a dog", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:52:30.984Z", + "passed": true, + "duration": 8548, + "category": "language" + }, + { + "test": "summarization", + "prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.", + "result": [], + "expected": "A fox jumps over a dog", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:53:31.003Z", + "passed": false, + "duration": 60014, + "error": { + "message": "API call timed out", + "code": "UNKNOWN", + "type": "Error", + "details": { + "stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:296:33)\n at listOnTimeout (node:internal/timers:588:17)\n at processTimers (node:internal/timers:523:7)", + "message": "API call timed out" + } + }, + "reason": "API call timed out", + "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:53:32.418Z", + "passed": true, + "duration": 1408, + "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:53:33.063Z", + "passed": true, + "duration": 639, + "category": "language" + }, + { + "test": "translation", + "prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.", + "result": [ + "¡Hola, mundo!" + ], + "expected": "¡Hola, mundo!", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:53:35.309Z", + "passed": true, + "duration": 2241, + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "I went to the store yesterday." + ], + "expected": "I went to the store yesterday", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:53:35.959Z", + "passed": false, + "duration": 645, + "reason": "Expected I went to the store yesterday, but got I went to the store yesterday.", + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "\"I went to the store yesterday.\"" + ], + "expected": "I went to the store yesterday", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:53:36.606Z", + "passed": false, + "duration": 641, + "reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"", + "category": "language" + }, + { + "test": "grammar", + "prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.", + "result": [ + "I went to the store yesterday." + ], + "expected": "I went to the store yesterday", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:53:41.701Z", + "passed": false, + "duration": 5090, + "reason": "Expected I went to the store yesterday, but got I went to the store yesterday.", + "category": "language" + }, + { + "test": "language_detection", + "prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.", + "result": [ + "French" + ], + "expected": "French", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:53:42.454Z", + "passed": true, + "duration": 747, + "category": "language" + }, + { + "test": "language_detection", + "prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.", + "result": [ + "French" + ], + "expected": "French", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-06-05T22:53:43.116Z", + "passed": true, + "duration": 657, + "category": "language" + }, + { + "test": "language_detection", + "prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.", + "result": [ + "French" + ], + "expected": "French", + "model": "deepseek/deepseek-r1:free", + "router": "deepseek/deepseek-r1:free", + "timestamp": "2025-06-05T22:53:47.420Z", + "passed": true, + "duration": 4299, + "category": "language" + }, + { + "test": "synonyms", + "prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.", + "result": [ + "Joyful" + ], + "expected": "joyful", + "model": "anthropic/claude-sonnet-4", + "router": "anthropic/claude-sonnet-4", + "timestamp": "2025-06-05T22:53:48.762Z", + "passed": true, + "duration": 1336, + "category": "language" } ], "highscores": [ { "test": "translation", "rankings": [ + { + "model": "openai/gpt-4o-mini", + "duration": 639, + "duration_secs": 0.639 + }, { "model": "openai/gpt-3.5-turbo", "duration": 818, "duration_secs": 0.818 - }, - { - "model": "openai/gpt-4o-mini", - "duration": 1451, - "duration_secs": 1.451 } ] }, @@ -1695,8 +2019,8 @@ }, { "model": "openai/gpt-4o-mini", - "duration": 695, - "duration_secs": 0.695 + "duration": 641, + "duration_secs": 0.641 } ] }, @@ -1710,8 +2034,8 @@ }, { "model": "openai/gpt-4o-mini", - "duration": 692, - "duration_secs": 0.692 + "duration": 8548, + "duration_secs": 8.548 } ] }, @@ -1720,8 +2044,8 @@ "rankings": [ { "model": "openai/gpt-4o-mini", - "duration": 459, - "duration_secs": 0.459 + "duration": 657, + "duration_secs": 0.657 }, { "model": "openai/gpt-3.5-turbo", @@ -1746,5 +2070,5 @@ ] } ], - "lastUpdated": "2025-06-05T22:31:00.924Z" + "lastUpdated": "2025-06-05T22:53:48.763Z" } \ No newline at end of file