diff --git a/packages/kbot/logs/params.json b/packages/kbot/logs/params.json index 605ca4cd..f9606554 100644 --- a/packages/kbot/logs/params.json +++ b/packages/kbot/logs/params.json @@ -1,5 +1,5 @@ { - "model": "qwen/qwq-32b", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", "messages": [ { "role": "user", diff --git a/packages/kbot/tests/unit/commons.ts b/packages/kbot/tests/unit/commons.ts index 7158d815..c80790ba 100644 --- a/packages/kbot/tests/unit/commons.ts +++ b/packages/kbot/tests/unit/commons.ts @@ -9,7 +9,11 @@ import { sync as mkdirp } from "mkdirp" export const getDefaultModels = () => [ //E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE, E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET, - E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B + E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B, + E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI, + E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO, + E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1, + E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE ] export const isOpenRouterModel = (model: string): boolean => { @@ -51,6 +55,7 @@ export interface TestResult { details?: any; }; duration?: number + category?: string; } export interface TestHighscore { @@ -108,7 +113,7 @@ export const runTest = async ( logPath: string ): Promise => { let model = 'unknown' - let router = 'unknown' + let router = 'openrouter' let startTime = Date.now() let error: TestResult['error'] | undefined let testResult: TestResult | undefined @@ -124,7 +129,7 @@ export const runTest = async ( preferences: TEST_PREFERENCES_PATH, onRun: async (options) => { model = options.model || 'unknown' - router = isOpenRouterModel(model) ? 'openrouter' : 'unknown' + router = options.model as string return options } }), @@ -181,6 +186,13 @@ export const runTest = async ( throw e } finally { if (testResult) { + // Extract category from logPath (e.g., 'reports/basic.json' -> 'basic') + const category = path.basename(logPath, path.extname(logPath)) + + // Add category to test result + testResult.category = category + + // Update category-specific log const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath) as string) : { results: [], highscores: [] } const updatedResults = [...(existingData.results || []), testResult] @@ -200,12 +212,40 @@ export const runTest = async ( // Generate highscores const highscores = generateHighscores(latestResults) - // Write both results and highscores + // Write category-specific results write(logPath, JSON.stringify({ results: updatedResults, highscores, lastUpdated: new Date().toISOString() }, null, 2)) + + // Update central all.json log + const allLogPath = path.resolve(REPORTS_DIR, 'all.json') + const allExistingData = exists(allLogPath) === 'file' ? JSON.parse(read(allLogPath) as string) : { results: [], highscores: [] } + const allUpdatedResults = [...(allExistingData.results || []), testResult] + + // Group all results by test and model + const allLatestResults = new Map>() + allUpdatedResults.forEach(result => { + if (!allLatestResults.has(result.test)) { + allLatestResults.set(result.test, new Map()) + } + const testMap = allLatestResults.get(result.test)! + const existingResult = testMap.get(result.model) + if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) { + testMap.set(result.model, result) + } + }) + + // Generate highscores for all results + const allHighscores = generateHighscores(allLatestResults) + + // Write all results + write(allLogPath, JSON.stringify({ + results: allUpdatedResults, + highscores: allHighscores, + lastUpdated: new Date().toISOString() + }, null, 2)) } } return testResult @@ -236,25 +276,29 @@ export const generateTestReport = ( // Add highscore section report += '## Highscores\n\n' - for (const [testName, modelResults] of latestResults) { - report += `### ${testName}\n` - - // Convert model results to array and sort by duration - const sortedResults = Array.from(modelResults.entries()) - .map(([model, result]) => ({ model, result })) - .sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0)) - .slice(0, 2) // Get top 2 + + // Create a table header + report += '| Test | Model | Duration (ms) | Duration (s) |\n' + report += '|------|-------|--------------|--------------|\n' + + // Sort all results by duration + const allResults = Array.from(latestResults.entries()) + .flatMap(([testName, modelResults]) => + Array.from(modelResults.entries()) + .map(([model, result]) => ({ + test: testName, + model, + duration: result.duration || 0 + })) + ) + .sort((a, b) => a.duration - b.duration) - if (sortedResults.length > 0) { - sortedResults.forEach(({ model, result }, index) => { - const duration = result.duration || 0 - report += `${index + 1}. ${model}: ${duration}ms (${(duration / 1000).toFixed(2)}s)\n` - }) - } else { - report += '*No results available*\n' - } - report += '\n' - } + // Add all results to the table + allResults.forEach(({ test, model, duration }) => { + report += `| ${test} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n` + }) + + report += '\n' // Add summary section report += '## Summary\n\n' @@ -275,7 +319,7 @@ export const generateTestReport = ( for (const [model, result] of modelResults) { if (!result.passed) { hasFailures = true - report += `### ${testName} - ${model}\n` + report += `### ${testName} - ${model}\n\n` report += `- Prompt: \`${result.prompt}\`\n` report += `- Expected: \`${result.expected}\`\n` report += `- Actual: \`${result.result[0] || ''}\`\n` @@ -305,7 +349,7 @@ export const generateTestReport = ( for (const [model, result] of modelResults) { if (result.passed) { hasPassed = true - report += `### ${testName} - ${model}\n` + report += `### ${testName} - ${model}\n\n` report += `- Prompt: \`${result.prompt}\`\n` report += `- Expected: \`${result.expected}\`\n` report += `- Actual: \`${result.result[0] || ''}\`\n` diff --git a/packages/kbot/tests/unit/reports/all.json b/packages/kbot/tests/unit/reports/all.json new file mode 100644 index 00000000..10c9bb52 --- /dev/null +++ b/packages/kbot/tests/unit/reports/all.json @@ -0,0 +1,1202 @@ +{ + "results": [ + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-3,-2", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:13:10.276Z", + "passed": false, + "duration": 1242, + "reason": "Expected -3,-2, but got -2,-3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-3,-2", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:13:31.650Z", + "passed": false, + "duration": 21368, + "reason": "Expected -3,-2, but got -2, -3", + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:13:34.699Z", + "passed": true, + "duration": 3046, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:13:45.957Z", + "passed": true, + "duration": 11256, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:13:47.935Z", + "passed": true, + "duration": 1976, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:14:07.714Z", + "passed": false, + "duration": 19778, + "reason": "Expected 8, but got 5", + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:14:08.883Z", + "passed": true, + "duration": 1167, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:14:12.225Z", + "passed": true, + "duration": 3341, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:14:12.889Z", + "passed": true, + "duration": 663, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:14:32.527Z", + "passed": true, + "duration": 19636, + "category": "math" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:21.370Z", + "passed": true, + "duration": 1213, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:24.898Z", + "passed": true, + "duration": 3524, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:25.624Z", + "passed": true, + "duration": 724, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:26.630Z", + "passed": true, + "duration": 1005, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:27.812Z", + "passed": true, + "duration": 1178, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:31.317Z", + "passed": true, + "duration": 3503, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:32.288Z", + "passed": true, + "duration": 969, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:33.147Z", + "passed": true, + "duration": 858, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:33.724Z", + "passed": true, + "duration": 576, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:34.841Z", + "passed": true, + "duration": 1115, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:35.673Z", + "passed": true, + "duration": 831, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:36.762Z", + "passed": true, + "duration": 1087, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:25.749Z", + "passed": true, + "duration": 1644, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:31.261Z", + "passed": true, + "duration": 5507, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:32.131Z", + "passed": true, + "duration": 869, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:33.306Z", + "passed": true, + "duration": 1173, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:34.323Z", + "passed": true, + "duration": 1016, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:38.976Z", + "passed": true, + "duration": 4651, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:39.914Z", + "passed": true, + "duration": 937, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:41.053Z", + "passed": true, + "duration": 1137, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:42.918Z", + "passed": true, + "duration": 1863, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:47.234Z", + "passed": true, + "duration": 4314, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:47.966Z", + "passed": true, + "duration": 730, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:48.941Z", + "passed": true, + "duration": 973, + "category": "basic" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-3,-2", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:10.419Z", + "passed": false, + "duration": 1650, + "reason": "Expected -3,-2, but got -2,-3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-3,-2" + ], + "expected": "-3,-2", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:20.647Z", + "passed": true, + "duration": 10222, + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-3,-2", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:21.643Z", + "passed": false, + "duration": 994, + "reason": "Expected -3,-2, but got -2, -3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-3,-2", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:22.524Z", + "passed": false, + "duration": 878, + "reason": "Expected -3,-2, but got -2, -3", + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:23.496Z", + "passed": true, + "duration": 970, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:28.452Z", + "passed": true, + "duration": 4954, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:29.325Z", + "passed": true, + "duration": 872, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:30.109Z", + "passed": true, + "duration": 782, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:33.902Z", + "passed": true, + "duration": 3791, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:46.225Z", + "passed": false, + "duration": 12322, + "reason": "Expected 8, but got 5", + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:47.202Z", + "passed": false, + "duration": 974, + "reason": "Expected 8, but got 5", + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:48.005Z", + "passed": true, + "duration": 800, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:48.763Z", + "passed": true, + "duration": 756, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:55.510Z", + "passed": true, + "duration": 6745, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:56.297Z", + "passed": true, + "duration": 785, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:57.051Z", + "passed": true, + "duration": 751, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:58.294Z", + "passed": true, + "duration": 1241, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:23:04.551Z", + "passed": false, + "duration": 6255, + "reason": "Model returned empty response", + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:23:05.297Z", + "passed": true, + "duration": 743, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:23:06.018Z", + "passed": true, + "duration": 719, + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-3,-2", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:24:32.237Z", + "passed": false, + "duration": 1533, + "reason": "Expected -3,-2, but got -2,-3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-3, -2" + ], + "expected": "-3,-2", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:24:50.178Z", + "passed": false, + "duration": 17934, + "reason": "Expected -3,-2, but got -3, -2", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-3,-2", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:24:51.040Z", + "passed": false, + "duration": 859, + "reason": "Expected -3,-2, but got -2, -3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-3,-2", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:24:51.938Z", + "passed": false, + "duration": 895, + "reason": "Expected -3,-2, but got -2, -3", + "category": "math" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:15.745Z", + "passed": true, + "duration": 1951, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:19.476Z", + "passed": true, + "duration": 3726, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:20.854Z", + "passed": true, + "duration": 1376, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:28.044Z", + "passed": true, + "duration": 7188, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-r1", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:43.203Z", + "passed": true, + "duration": 15157, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:50.736Z", + "passed": true, + "duration": 7531, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:51.834Z", + "passed": true, + "duration": 1096, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:55.428Z", + "passed": true, + "duration": 3592, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:56.874Z", + "passed": true, + "duration": 1444, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:57.746Z", + "passed": true, + "duration": 870, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "deepseek/deepseek-r1", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:08.731Z", + "passed": true, + "duration": 10983, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:14.379Z", + "passed": true, + "duration": 5646, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:15.658Z", + "passed": true, + "duration": 1276, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:21.428Z", + "passed": true, + "duration": 5768, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:22.358Z", + "passed": true, + "duration": 929, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:23.155Z", + "passed": true, + "duration": 794, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "deepseek/deepseek-r1", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:38.566Z", + "passed": true, + "duration": 15409, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:40.358Z", + "passed": true, + "duration": 1790, + "category": "basic" + } + ], + "highscores": [ + { + "test": "quadratic", + "rankings": [ + { + "model": "openai/gpt-4o-mini", + "duration": 859, + "duration_secs": 0.859 + }, + { + "model": "openai/gpt-3.5-turbo", + "duration": 895, + "duration_secs": 0.895 + } + ] + }, + { + "test": "factorial", + "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 782, + "duration_secs": 0.782 + }, + { + "model": "openai/gpt-4o-mini", + "duration": 872, + "duration_secs": 0.872 + } + ] + }, + { + "test": "fibonacci", + "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 800, + "duration_secs": 0.8 + }, + { + "model": "openai/gpt-4o-mini", + "duration": 974, + "duration_secs": 0.974 + } + ] + }, + { + "test": "square_root", + "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 751, + "duration_secs": 0.751 + }, + { + "model": "anthropic/claude-3.5-sonnet", + "duration": 756, + "duration_secs": 0.756 + } + ] + }, + { + "test": "power", + "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 719, + "duration_secs": 0.719 + }, + { + "model": "openai/gpt-4o-mini", + "duration": 743, + "duration_secs": 0.743 + } + ] + }, + { + "test": "addition", + "rankings": [ + { + "model": "openai/gpt-4o-mini", + "duration": 1376, + "duration_secs": 1.376 + }, + { + "model": "anthropic/claude-3.5-sonnet", + "duration": 1951, + "duration_secs": 1.951 + } + ] + }, + { + "test": "multiplication", + "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 870, + "duration_secs": 0.87 + }, + { + "model": "anthropic/claude-3.5-sonnet", + "duration": 1096, + "duration_secs": 1.096 + } + ] + }, + { + "test": "division", + "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 794, + "duration_secs": 0.794 + }, + { + "model": "openai/gpt-4o-mini", + "duration": 929, + "duration_secs": 0.929 + } + ] + } + ], + "lastUpdated": "2025-04-02T11:26:40.359Z" +} \ No newline at end of file diff --git a/packages/kbot/tests/unit/reports/basic.json b/packages/kbot/tests/unit/reports/basic.json index 8c6dc168..8e197807 100644 --- a/packages/kbot/tests/unit/reports/basic.json +++ b/packages/kbot/tests/unit/reports/basic.json @@ -155,6 +155,594 @@ "timestamp": "2025-04-02T11:01:27.791Z", "passed": true, "duration": 7852 + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:21.370Z", + "passed": true, + "duration": 1213, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:24.898Z", + "passed": true, + "duration": 3524, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:25.624Z", + "passed": true, + "duration": 724, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:26.630Z", + "passed": true, + "duration": 1005, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:27.812Z", + "passed": true, + "duration": 1178, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:31.317Z", + "passed": true, + "duration": 3503, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:32.288Z", + "passed": true, + "duration": 969, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:33.147Z", + "passed": true, + "duration": 858, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:33.724Z", + "passed": true, + "duration": 576, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:34.841Z", + "passed": true, + "duration": 1115, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:35.673Z", + "passed": true, + "duration": 831, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:16:36.762Z", + "passed": true, + "duration": 1087, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:25.749Z", + "passed": true, + "duration": 1644, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:31.261Z", + "passed": true, + "duration": 5507, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:32.131Z", + "passed": true, + "duration": 869, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:33.306Z", + "passed": true, + "duration": 1173, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:34.323Z", + "passed": true, + "duration": 1016, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:38.976Z", + "passed": true, + "duration": 4651, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:39.914Z", + "passed": true, + "duration": 937, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:41.053Z", + "passed": true, + "duration": 1137, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:42.918Z", + "passed": true, + "duration": 1863, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:47.234Z", + "passed": true, + "duration": 4314, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:47.966Z", + "passed": true, + "duration": 730, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:20:48.941Z", + "passed": true, + "duration": 973, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:15.745Z", + "passed": true, + "duration": 1951, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:19.476Z", + "passed": true, + "duration": 3726, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:20.854Z", + "passed": true, + "duration": 1376, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:28.044Z", + "passed": true, + "duration": 7188, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-r1", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:43.203Z", + "passed": true, + "duration": 15157, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:50.736Z", + "passed": true, + "duration": 7531, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:51.834Z", + "passed": true, + "duration": 1096, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:55.428Z", + "passed": true, + "duration": 3592, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:56.874Z", + "passed": true, + "duration": 1444, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:25:57.746Z", + "passed": true, + "duration": 870, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "deepseek/deepseek-r1", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:08.731Z", + "passed": true, + "duration": 10983, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:14.379Z", + "passed": true, + "duration": 5646, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:15.658Z", + "passed": true, + "duration": 1276, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:21.428Z", + "passed": true, + "duration": 5768, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:22.358Z", + "passed": true, + "duration": 929, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:23.155Z", + "passed": true, + "duration": 794, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "deepseek/deepseek-r1", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:38.566Z", + "passed": true, + "duration": 15409, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "openrouter", + "timestamp": "2025-04-02T11:26:40.358Z", + "passed": true, + "duration": 1790, + "category": "basic" } ], "highscores": [ @@ -162,14 +750,14 @@ "test": "addition", "rankings": [ { - "model": "anthropic/claude-3.5-sonnet", - "duration": 1888, - "duration_secs": 1.888 + "model": "openai/gpt-4o-mini", + "duration": 1376, + "duration_secs": 1.376 }, { - "model": "qwen/qwq-32b", - "duration": 6304, - "duration_secs": 6.304 + "model": "anthropic/claude-3.5-sonnet", + "duration": 1951, + "duration_secs": 1.951 } ] }, @@ -177,14 +765,14 @@ "test": "multiplication", "rankings": [ { - "model": "anthropic/claude-3.5-sonnet", - "duration": 1291, - "duration_secs": 1.291 + "model": "openai/gpt-3.5-turbo", + "duration": 870, + "duration_secs": 0.87 }, { - "model": "qwen/qwq-32b", - "duration": 2225, - "duration_secs": 2.225 + "model": "anthropic/claude-3.5-sonnet", + "duration": 1096, + "duration_secs": 1.096 } ] }, @@ -192,17 +780,17 @@ "test": "division", "rankings": [ { - "model": "anthropic/claude-3.5-sonnet", - "duration": 1209, - "duration_secs": 1.209 + "model": "openai/gpt-3.5-turbo", + "duration": 794, + "duration_secs": 0.794 }, { - "model": "qwen/qwq-32b", - "duration": 7852, - "duration_secs": 7.852 + "model": "openai/gpt-4o-mini", + "duration": 929, + "duration_secs": 0.929 } ] } ], - "lastUpdated": "2025-04-02T11:01:27.792Z" + "lastUpdated": "2025-04-02T11:26:40.358Z" } \ No newline at end of file diff --git a/packages/kbot/tests/unit/reports/basic.markdown b/packages/kbot/tests/unit/reports/basic.markdown deleted file mode 100644 index 6af71a0a..00000000 --- a/packages/kbot/tests/unit/reports/basic.markdown +++ /dev/null @@ -1,72 +0,0 @@ -# Basic Operations Test Results - -## Highscores - -### addition -1. anthropic/claude-3.5-sonnet: 1237ms (1.24s) -2. qwen/qwq-32b: 4298ms (4.30s) - -### multiplication -1. anthropic/claude-3.5-sonnet: 1411ms (1.41s) -2. qwen/qwq-32b: 3122ms (3.12s) - -### division -1. anthropic/claude-3.5-sonnet: 583ms (0.58s) -2. qwen/qwq-32b: 3615ms (3.62s) - -## Summary - -- Total Tests: 6 -- Passed: 6 -- Failed: 0 -- Success Rate: 100.00% -- Average Duration: 2378ms (2.38s) - -## Failed Tests - -*No failed tests* - -## Passed Tests - -### addition - anthropic/claude-3.5-sonnet -- Prompt: `add 5 and 3. Return only the number, no explanation.` -- Expected: `8` -- Actual: `8` -- Duration: 1237ms (1237.00s) -- Timestamp: 4/2/2025, 12:56:09 PM - -### addition - qwen/qwq-32b -- Prompt: `add 5 and 3. Return only the number, no explanation.` -- Expected: `8` -- Actual: `8` -- Duration: 4298ms (4298.00s) -- Timestamp: 4/2/2025, 12:56:13 PM - -### multiplication - anthropic/claude-3.5-sonnet -- Prompt: `multiply 8 and 3. Return only the number, no explanation.` -- Expected: `24` -- Actual: `24` -- Duration: 1411ms (1411.00s) -- Timestamp: 4/2/2025, 12:56:15 PM - -### multiplication - qwen/qwq-32b -- Prompt: `multiply 8 and 3. Return only the number, no explanation.` -- Expected: `24` -- Actual: `24` -- Duration: 3122ms (3122.00s) -- Timestamp: 4/2/2025, 12:56:18 PM - -### division - anthropic/claude-3.5-sonnet -- Prompt: `divide 15 by 3. Return only the number, no explanation.` -- Expected: `5` -- Actual: `5` -- Duration: 583ms (583.00s) -- Timestamp: 4/2/2025, 12:56:18 PM - -### division - qwen/qwq-32b -- Prompt: `divide 15 by 3. Return only the number, no explanation.` -- Expected: `5` -- Actual: `5` -- Duration: 3615ms (3615.00s) -- Timestamp: 4/2/2025, 12:56:22 PM - diff --git a/packages/kbot/tests/unit/reports/basic.md b/packages/kbot/tests/unit/reports/basic.md index fba3ef39..ee9d010c 100644 --- a/packages/kbot/tests/unit/reports/basic.md +++ b/packages/kbot/tests/unit/reports/basic.md @@ -2,25 +2,34 @@ ## Highscores -### addition -1. anthropic/claude-3.5-sonnet: 1888ms (1.89s) -2. qwen/qwq-32b: 6304ms (6.30s) - -### multiplication -1. anthropic/claude-3.5-sonnet: 1291ms (1.29s) -2. qwen/qwq-32b: 2225ms (2.23s) - -### division -1. anthropic/claude-3.5-sonnet: 1209ms (1.21s) -2. qwen/qwq-32b: 7852ms (7.85s) +| Test | Model | Duration (ms) | Duration (s) | +|------|-------|--------------|--------------| +| division | openai/gpt-3.5-turbo | 794 | 0.79 | +| multiplication | openai/gpt-3.5-turbo | 870 | 0.87 | +| division | openai/gpt-4o-mini | 929 | 0.93 | +| multiplication | anthropic/claude-3.5-sonnet | 1096 | 1.10 | +| division | anthropic/claude-3.5-sonnet | 1276 | 1.28 | +| addition | openai/gpt-4o-mini | 1376 | 1.38 | +| multiplication | openai/gpt-4o-mini | 1444 | 1.44 | +| division | deepseek/deepseek-r1-distill-qwen-14b:free | 1790 | 1.79 | +| addition | anthropic/claude-3.5-sonnet | 1951 | 1.95 | +| multiplication | qwen/qwq-32b | 3592 | 3.59 | +| addition | qwen/qwq-32b | 3726 | 3.73 | +| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 5646 | 5.65 | +| division | qwen/qwq-32b | 5768 | 5.77 | +| addition | openai/gpt-3.5-turbo | 7188 | 7.19 | +| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 7531 | 7.53 | +| multiplication | deepseek/deepseek-r1 | 10983 | 10.98 | +| addition | deepseek/deepseek-r1 | 15157 | 15.16 | +| division | deepseek/deepseek-r1 | 15409 | 15.41 | ## Summary -- Total Tests: 6 -- Passed: 6 +- Total Tests: 18 +- Passed: 18 - Failed: 0 - Success Rate: 100.00% -- Average Duration: 3462ms (3.46s) +- Average Duration: 4807ms (4.81s) ## Failed Tests @@ -29,44 +38,146 @@ ## Passed Tests ### addition - anthropic/claude-3.5-sonnet + - Prompt: `add 5 and 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 1888ms (1888.00s) -- Timestamp: 4/2/2025, 1:01:08 PM +- Duration: 1951ms (1951.00s) +- Timestamp: 4/2/2025, 1:25:15 PM ### addition - qwen/qwq-32b + - Prompt: `add 5 and 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 6304ms (6304.00s) -- Timestamp: 4/2/2025, 1:01:15 PM +- Duration: 3726ms (3726.00s) +- Timestamp: 4/2/2025, 1:25:19 PM + +### addition - openai/gpt-4o-mini + +- Prompt: `add 5 and 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 1376ms (1376.00s) +- Timestamp: 4/2/2025, 1:25:20 PM + +### addition - openai/gpt-3.5-turbo + +- Prompt: `add 5 and 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 7188ms (7188.00s) +- Timestamp: 4/2/2025, 1:25:28 PM + +### addition - deepseek/deepseek-r1 + +- Prompt: `add 5 and 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 15157ms (15157.00s) +- Timestamp: 4/2/2025, 1:25:43 PM + +### addition - deepseek/deepseek-r1-distill-qwen-14b:free + +- Prompt: `add 5 and 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 7531ms (7531.00s) +- Timestamp: 4/2/2025, 1:25:50 PM ### multiplication - anthropic/claude-3.5-sonnet + - Prompt: `multiply 8 and 3. Return only the number, no explanation.` - Expected: `24` - Actual: `24` -- Duration: 1291ms (1291.00s) -- Timestamp: 4/2/2025, 1:01:16 PM +- Duration: 1096ms (1096.00s) +- Timestamp: 4/2/2025, 1:25:51 PM ### multiplication - qwen/qwq-32b + - Prompt: `multiply 8 and 3. Return only the number, no explanation.` - Expected: `24` - Actual: `24` -- Duration: 2225ms (2225.00s) -- Timestamp: 4/2/2025, 1:01:18 PM +- Duration: 3592ms (3592.00s) +- Timestamp: 4/2/2025, 1:25:55 PM + +### multiplication - openai/gpt-4o-mini + +- Prompt: `multiply 8 and 3. Return only the number, no explanation.` +- Expected: `24` +- Actual: `24` +- Duration: 1444ms (1444.00s) +- Timestamp: 4/2/2025, 1:25:56 PM + +### multiplication - openai/gpt-3.5-turbo + +- Prompt: `multiply 8 and 3. Return only the number, no explanation.` +- Expected: `24` +- Actual: `24` +- Duration: 870ms (870.00s) +- Timestamp: 4/2/2025, 1:25:57 PM + +### multiplication - deepseek/deepseek-r1 + +- Prompt: `multiply 8 and 3. Return only the number, no explanation.` +- Expected: `24` +- Actual: `24` +- Duration: 10983ms (10983.00s) +- Timestamp: 4/2/2025, 1:26:08 PM + +### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free + +- Prompt: `multiply 8 and 3. Return only the number, no explanation.` +- Expected: `24` +- Actual: `24` +- Duration: 5646ms (5646.00s) +- Timestamp: 4/2/2025, 1:26:14 PM ### division - anthropic/claude-3.5-sonnet + - Prompt: `divide 15 by 3. Return only the number, no explanation.` - Expected: `5` - Actual: `5` -- Duration: 1209ms (1209.00s) -- Timestamp: 4/2/2025, 1:01:19 PM +- Duration: 1276ms (1276.00s) +- Timestamp: 4/2/2025, 1:26:15 PM ### division - qwen/qwq-32b + - Prompt: `divide 15 by 3. Return only the number, no explanation.` - Expected: `5` - Actual: `5` -- Duration: 7852ms (7852.00s) -- Timestamp: 4/2/2025, 1:01:27 PM +- Duration: 5768ms (5768.00s) +- Timestamp: 4/2/2025, 1:26:21 PM + +### division - openai/gpt-4o-mini + +- Prompt: `divide 15 by 3. Return only the number, no explanation.` +- Expected: `5` +- Actual: `5` +- Duration: 929ms (929.00s) +- Timestamp: 4/2/2025, 1:26:22 PM + +### division - openai/gpt-3.5-turbo + +- Prompt: `divide 15 by 3. Return only the number, no explanation.` +- Expected: `5` +- Actual: `5` +- Duration: 794ms (794.00s) +- Timestamp: 4/2/2025, 1:26:23 PM + +### division - deepseek/deepseek-r1 + +- Prompt: `divide 15 by 3. Return only the number, no explanation.` +- Expected: `5` +- Actual: `5` +- Duration: 15409ms (15409.00s) +- Timestamp: 4/2/2025, 1:26:38 PM + +### division - deepseek/deepseek-r1-distill-qwen-14b:free + +- Prompt: `divide 15 by 3. Return only the number, no explanation.` +- Expected: `5` +- Actual: `5` +- Duration: 1790ms (1790.00s) +- Timestamp: 4/2/2025, 1:26:40 PM diff --git a/packages/kbot/tests/unit/reports/math.json b/packages/kbot/tests/unit/reports/math.json new file mode 100644 index 00000000..959ae04e --- /dev/null +++ b/packages/kbot/tests/unit/reports/math.json @@ -0,0 +1,700 @@ +{ + "results": [ + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-3,-2", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:09:36.865Z", + "passed": false, + "duration": 1944, + "reason": "Expected -3,-2, but got -2,-3" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-3,-2" + ], + "expected": "-3,-2", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:09:47.481Z", + "passed": true, + "duration": 10608 + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:09:49.153Z", + "passed": true, + "duration": 1671 + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [], + "expected": "120", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:10:03.043Z", + "passed": false, + "duration": 13889, + "reason": "Model returned empty response" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:10:03.988Z", + "passed": true, + "duration": 943 + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:10:05.723Z", + "passed": false, + "duration": 1734, + "reason": "Expected 8, but got 5" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:10:07.465Z", + "passed": true, + "duration": 1739 + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:10:13.671Z", + "passed": true, + "duration": 6205 + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:10:14.967Z", + "passed": true, + "duration": 1295 + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:10:20.932Z", + "passed": true, + "duration": 5964 + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-3,-2", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:13:10.276Z", + "passed": false, + "duration": 1242, + "reason": "Expected -3,-2, but got -2,-3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-3,-2", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:13:31.650Z", + "passed": false, + "duration": 21368, + "reason": "Expected -3,-2, but got -2, -3", + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:13:34.699Z", + "passed": true, + "duration": 3046, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:13:45.957Z", + "passed": true, + "duration": 11256, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:13:47.935Z", + "passed": true, + "duration": 1976, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:14:07.714Z", + "passed": false, + "duration": 19778, + "reason": "Expected 8, but got 5", + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:14:08.883Z", + "passed": true, + "duration": 1167, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:14:12.225Z", + "passed": true, + "duration": 3341, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:14:12.889Z", + "passed": true, + "duration": 663, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:14:32.527Z", + "passed": true, + "duration": 19636, + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-3,-2", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:10.419Z", + "passed": false, + "duration": 1650, + "reason": "Expected -3,-2, but got -2,-3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-3,-2" + ], + "expected": "-3,-2", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:20.647Z", + "passed": true, + "duration": 10222, + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-3,-2", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:21.643Z", + "passed": false, + "duration": 994, + "reason": "Expected -3,-2, but got -2, -3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-3,-2", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:22.524Z", + "passed": false, + "duration": 878, + "reason": "Expected -3,-2, but got -2, -3", + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:23.496Z", + "passed": true, + "duration": 970, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:28.452Z", + "passed": true, + "duration": 4954, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:29.325Z", + "passed": true, + "duration": 872, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:30.109Z", + "passed": true, + "duration": 782, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:33.902Z", + "passed": true, + "duration": 3791, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:46.225Z", + "passed": false, + "duration": 12322, + "reason": "Expected 8, but got 5", + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:47.202Z", + "passed": false, + "duration": 974, + "reason": "Expected 8, but got 5", + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:48.005Z", + "passed": true, + "duration": 800, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:48.763Z", + "passed": true, + "duration": 756, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:55.510Z", + "passed": true, + "duration": 6745, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:56.297Z", + "passed": true, + "duration": 785, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:57.051Z", + "passed": true, + "duration": 751, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:22:58.294Z", + "passed": true, + "duration": 1241, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [], + "expected": "8", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:23:04.551Z", + "passed": false, + "duration": 6255, + "reason": "Model returned empty response", + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:23:05.297Z", + "passed": true, + "duration": 743, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:23:06.018Z", + "passed": true, + "duration": 719, + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-3,-2", + "model": "anthropic/claude-3.5-sonnet", + "router": "openrouter", + "timestamp": "2025-04-02T11:24:32.237Z", + "passed": false, + "duration": 1533, + "reason": "Expected -3,-2, but got -2,-3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-3, -2" + ], + "expected": "-3,-2", + "model": "qwen/qwq-32b", + "router": "openrouter", + "timestamp": "2025-04-02T11:24:50.178Z", + "passed": false, + "duration": 17934, + "reason": "Expected -3,-2, but got -3, -2", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-3,-2", + "model": "openai/gpt-4o-mini", + "router": "openrouter", + "timestamp": "2025-04-02T11:24:51.040Z", + "passed": false, + "duration": 859, + "reason": "Expected -3,-2, but got -2, -3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-3,-2", + "model": "openai/gpt-3.5-turbo", + "router": "openrouter", + "timestamp": "2025-04-02T11:24:51.938Z", + "passed": false, + "duration": 895, + "reason": "Expected -3,-2, but got -2, -3", + "category": "math" + } + ], + "highscores": [ + { + "test": "quadratic", + "rankings": [ + { + "model": "openai/gpt-4o-mini", + "duration": 859, + "duration_secs": 0.859 + }, + { + "model": "openai/gpt-3.5-turbo", + "duration": 895, + "duration_secs": 0.895 + } + ] + }, + { + "test": "factorial", + "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 782, + "duration_secs": 0.782 + }, + { + "model": "openai/gpt-4o-mini", + "duration": 872, + "duration_secs": 0.872 + } + ] + }, + { + "test": "fibonacci", + "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 800, + "duration_secs": 0.8 + }, + { + "model": "openai/gpt-4o-mini", + "duration": 974, + "duration_secs": 0.974 + } + ] + }, + { + "test": "square_root", + "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 751, + "duration_secs": 0.751 + }, + { + "model": "anthropic/claude-3.5-sonnet", + "duration": 756, + "duration_secs": 0.756 + } + ] + }, + { + "test": "power", + "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 719, + "duration_secs": 0.719 + }, + { + "model": "openai/gpt-4o-mini", + "duration": 743, + "duration_secs": 0.743 + } + ] + } + ], + "lastUpdated": "2025-04-02T11:24:51.939Z" +} \ No newline at end of file diff --git a/packages/kbot/tests/unit/reports/math.md b/packages/kbot/tests/unit/reports/math.md new file mode 100644 index 00000000..7149f727 --- /dev/null +++ b/packages/kbot/tests/unit/reports/math.md @@ -0,0 +1,206 @@ +# Math Operations Test Results + + +## Highscores + +| Test | Model | Duration (ms) | Duration (s) | +|------|-------|--------------|--------------| +| power | openai/gpt-3.5-turbo | 719 | 0.72 | +| power | openai/gpt-4o-mini | 743 | 0.74 | +| square_root | openai/gpt-3.5-turbo | 751 | 0.75 | +| square_root | anthropic/claude-3.5-sonnet | 756 | 0.76 | +| factorial | openai/gpt-3.5-turbo | 782 | 0.78 | +| square_root | openai/gpt-4o-mini | 785 | 0.79 | +| fibonacci | openai/gpt-3.5-turbo | 800 | 0.80 | +| factorial | openai/gpt-4o-mini | 872 | 0.87 | +| quadratic | openai/gpt-3.5-turbo | 878 | 0.88 | +| factorial | anthropic/claude-3.5-sonnet | 970 | 0.97 | +| fibonacci | openai/gpt-4o-mini | 974 | 0.97 | +| quadratic | openai/gpt-4o-mini | 994 | 0.99 | +| power | anthropic/claude-3.5-sonnet | 1241 | 1.24 | +| quadratic | anthropic/claude-3.5-sonnet | 1650 | 1.65 | +| fibonacci | anthropic/claude-3.5-sonnet | 3791 | 3.79 | +| factorial | qwen/qwq-32b | 4954 | 4.95 | +| power | qwen/qwq-32b | 6255 | 6.25 | +| square_root | qwen/qwq-32b | 6745 | 6.75 | +| quadratic | qwen/qwq-32b | 10222 | 10.22 | +| fibonacci | qwen/qwq-32b | 12322 | 12.32 | + +## Summary + +- Total Tests: 20 +- Passed: 14 +- Failed: 6 +- Success Rate: 70.00% +- Average Duration: 2860ms (2.86s) + +## Failed Tests + +### quadratic - anthropic/claude-3.5-sonnet + +- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.` +- Expected: `-3,-2` +- Actual: `-2,-3` +- Duration: 1650ms (1650.00s) +- Reason: Expected -3,-2, but got -2,-3 +- Timestamp: 4/2/2025, 1:22:10 PM + +### quadratic - openai/gpt-4o-mini + +- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.` +- Expected: `-3,-2` +- Actual: `-2, -3` +- Duration: 994ms (994.00s) +- Reason: Expected -3,-2, but got -2, -3 +- Timestamp: 4/2/2025, 1:22:21 PM + +### quadratic - openai/gpt-3.5-turbo + +- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.` +- Expected: `-3,-2` +- Actual: `-2, -3` +- Duration: 878ms (878.00s) +- Reason: Expected -3,-2, but got -2, -3 +- Timestamp: 4/2/2025, 1:22:22 PM + +### fibonacci - qwen/qwq-32b + +- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.` +- Expected: `8` +- Actual: `5` +- Duration: 12322ms (12322.00s) +- Reason: Expected 8, but got 5 +- Timestamp: 4/2/2025, 1:22:46 PM + +### fibonacci - openai/gpt-4o-mini + +- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.` +- Expected: `8` +- Actual: `5` +- Duration: 974ms (974.00s) +- Reason: Expected 8, but got 5 +- Timestamp: 4/2/2025, 1:22:47 PM + +### power - qwen/qwq-32b + +- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `` +- Duration: 6255ms (6255.00s) +- Reason: Model returned empty response +- Timestamp: 4/2/2025, 1:23:04 PM + +## Passed Tests + +### quadratic - qwen/qwq-32b + +- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.` +- Expected: `-3,-2` +- Actual: `-3,-2` +- Duration: 10222ms (10222.00s) +- Timestamp: 4/2/2025, 1:22:20 PM + +### factorial - anthropic/claude-3.5-sonnet + +- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.` +- Expected: `120` +- Actual: `120` +- Duration: 970ms (970.00s) +- Timestamp: 4/2/2025, 1:22:23 PM + +### factorial - qwen/qwq-32b + +- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.` +- Expected: `120` +- Actual: `120` +- Duration: 4954ms (4954.00s) +- Timestamp: 4/2/2025, 1:22:28 PM + +### factorial - openai/gpt-4o-mini + +- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.` +- Expected: `120` +- Actual: `120` +- Duration: 872ms (872.00s) +- Timestamp: 4/2/2025, 1:22:29 PM + +### factorial - openai/gpt-3.5-turbo + +- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.` +- Expected: `120` +- Actual: `120` +- Duration: 782ms (782.00s) +- Timestamp: 4/2/2025, 1:22:30 PM + +### fibonacci - anthropic/claude-3.5-sonnet + +- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 3791ms (3791.00s) +- Timestamp: 4/2/2025, 1:22:33 PM + +### fibonacci - openai/gpt-3.5-turbo + +- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 800ms (800.00s) +- Timestamp: 4/2/2025, 1:22:48 PM + +### square_root - anthropic/claude-3.5-sonnet + +- Prompt: `Calculate the square root of 16. Return only the number, no explanation.` +- Expected: `4` +- Actual: `4` +- Duration: 756ms (756.00s) +- Timestamp: 4/2/2025, 1:22:48 PM + +### square_root - qwen/qwq-32b + +- Prompt: `Calculate the square root of 16. Return only the number, no explanation.` +- Expected: `4` +- Actual: `4` +- Duration: 6745ms (6745.00s) +- Timestamp: 4/2/2025, 1:22:55 PM + +### square_root - openai/gpt-4o-mini + +- Prompt: `Calculate the square root of 16. Return only the number, no explanation.` +- Expected: `4` +- Actual: `4` +- Duration: 785ms (785.00s) +- Timestamp: 4/2/2025, 1:22:56 PM + +### square_root - openai/gpt-3.5-turbo + +- Prompt: `Calculate the square root of 16. Return only the number, no explanation.` +- Expected: `4` +- Actual: `4` +- Duration: 751ms (751.00s) +- Timestamp: 4/2/2025, 1:22:57 PM + +### power - anthropic/claude-3.5-sonnet + +- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 1241ms (1241.00s) +- Timestamp: 4/2/2025, 1:22:58 PM + +### power - openai/gpt-4o-mini + +- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 743ms (743.00s) +- Timestamp: 4/2/2025, 1:23:05 PM + +### power - openai/gpt-3.5-turbo + +- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 719ms (719.00s) +- Timestamp: 4/2/2025, 1:23:06 PM +