From ddf8dbce1fdefbe2eaf2dc7fe772a110c5fcd75b Mon Sep 17 00:00:00 2001 From: babayaga Date: Fri, 4 Apr 2025 14:44:04 +0200 Subject: [PATCH] tests: quasar - alpha --- packages/kbot/dist/0c550cfc34328e29d9df.js | 92 ++++ packages/kbot/logs/params.json | 4 +- packages/kbot/package.json | 2 + packages/kbot/tests/unit/coding.test.ts | 122 +++++ packages/kbot/tests/unit/commons.ts | 3 +- packages/kbot/tests/unit/reports/all.json | 541 ++++++++++++++++++-- packages/kbot/tests/unit/reports/basic.json | 205 +++++++- packages/kbot/tests/unit/reports/basic.md | 115 +++-- packages/kbot/tests/unit/reports/math.json | 338 +++++++++++- packages/kbot/tests/unit/reports/math.md | 209 ++++---- 10 files changed, 1393 insertions(+), 238 deletions(-) create mode 100644 packages/kbot/dist/0c550cfc34328e29d9df.js create mode 100644 packages/kbot/tests/unit/coding.test.ts diff --git a/packages/kbot/dist/0c550cfc34328e29d9df.js b/packages/kbot/dist/0c550cfc34328e29d9df.js new file mode 100644 index 00000000..c8edaa09 --- /dev/null +++ b/packages/kbot/dist/0c550cfc34328e29d9df.js @@ -0,0 +1,92 @@ +import chalk from 'chalk'; +import * as path from 'node:path'; +import { sync as read } from '@polymech/fs/read'; +import { sync as exists } from '@polymech/fs/exists'; +import { logger, module_root } from '../index.js'; +import { CACHE_PATH as OPENROUTER_CACHE_PATH, fetchOpenRouterModels, listModelsAsStrings as listOpenRouterModelsAsStrings } from './openrouter.js'; +import { CACHE_PATH as OPENAI_CACHE_PATH, listModelsAsStrings as listOpenAIModelsAsStrings } from './openai.js'; +import { fetchOpenAIModels } from '../models/openai.js'; +import { CONFIG_DEFAULT } from '@polymech/commons'; +import { models as OpenAIModels } from './cache/openai.js'; +import { models as OpenRouterModels } from './cache/openrouter.js'; +export const models_dist = () => { + let or_models = OpenRouterModels; + let oai_models = OpenAIModels; + let deepseek_models = [ + { + "id": "deepseek-chat", + "name": "deepseek-chat" + }, + { + "id": "deepseek-reasoner", + "name": "deepseek-reasoner" + }, + ]; + const modelsOpenAIPath = path.resolve(module_root(), 'openai.json'); + if (exists(modelsOpenAIPath)) { + oai_models = read(modelsOpenAIPath, 'json'); + } + const modelsRouterPath = path.resolve(module_root(), 'openrouter.json'); + if (exists(modelsRouterPath)) { + or_models = read(modelsRouterPath, 'json'); + } + const models = []; + models.push(chalk.magenta.bold('\n OpenRouter models:\n')); + models.push(...listOpenRouterModelsAsStrings(or_models)); + models.push(chalk.magenta.bold('\n OpenAI models:\n')); + models.push(...listOpenAIModelsAsStrings(oai_models)); + models.push('-----\n'); + models.push(chalk.magenta.bold('\n Deepseek models:\n')); + models.push(...listOpenAIModelsAsStrings(deepseek_models)); + models.push('-----\n'); + return models; +}; +export const models = () => { + const models = []; + const openRouterPath = path.resolve(OPENROUTER_CACHE_PATH); + if (!exists(openRouterPath)) { + fetchOpenRouterModels(); + } + else { + const modelData = read(openRouterPath, 'json'); + models.push(chalk.magenta.bold('\n OpenRouter models:\n')); + models.push(...listOpenRouterModelsAsStrings(modelData.models)); + } + logger.debug('Openrouter models cache: ', OPENAI_CACHE_PATH); + const openAIPath = path.resolve(OPENAI_CACHE_PATH); + const config = CONFIG_DEFAULT(); + if (!exists(openAIPath) && config?.openai?.key) { + fetchOpenAIModels(config.openai.key); + } + if (exists(openAIPath)) { + const modelData = read(openAIPath, 'json'); + models.push(chalk.magenta.bold('\n OpenAI models:\n')); + models.push(...listOpenAIModelsAsStrings(modelData.models)); + } + logger.debug('OpenAI models cache: ', OPENAI_CACHE_PATH); + models.push('-----\n'); + return models; +}; +export const all = () => { + let models = []; + const openRouterPath = path.resolve(OPENROUTER_CACHE_PATH); + if (!exists(openRouterPath)) { + fetchOpenRouterModels(); + } + else { + const modelData = read(openRouterPath, 'json'); + models = models.concat(modelData.models); + } + const openAIPath = path.resolve(OPENAI_CACHE_PATH); + const config = CONFIG_DEFAULT(); + if (!exists(openAIPath) && config?.openai?.key) { + fetchOpenAIModels(config.openai.key); + } + if (exists(openAIPath)) { + const modelData = read(openAIPath, 'json'); + models.push(chalk.magenta.bold('\n OpenAI models:\n')); + models = models.concat(modelData.models); + } + return models; +}; +//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi9zcmMvbW9kZWxzL2luZGV4LnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiJBQUFBLE9BQU8sS0FBSyxNQUFNLE9BQU8sQ0FBQTtBQUN6QixPQUFPLEtBQUssSUFBSSxNQUFNLFdBQVcsQ0FBQTtBQUNqQyxPQUFPLEVBQUUsSUFBSSxJQUFJLElBQUksRUFBRSxNQUFNLG1CQUFtQixDQUFBO0FBQ2hELE9BQU8sRUFBRSxJQUFJLElBQUksTUFBTSxFQUFFLE1BQU0scUJBQXFCLENBQUE7QUFFcEQsT0FBTyxFQUFFLE1BQU0sRUFBRSxXQUFXLEVBQUUsTUFBTSxhQUFhLENBQUE7QUFDakQsT0FBTyxFQUFFLFVBQVUsSUFBSSxxQkFBcUIsRUFBMEMscUJBQXFCLEVBQUUsbUJBQW1CLElBQUksNkJBQTZCLEVBQUUsTUFBTSxpQkFBaUIsQ0FBQTtBQUMxTCxPQUFPLEVBQUUsVUFBVSxJQUFJLGlCQUFpQixFQUFzQyxtQkFBbUIsSUFBSSx5QkFBeUIsRUFBRSxNQUFNLGFBQWEsQ0FBQTtBQUVuSixPQUFPLEVBQUUsaUJBQWlCLEVBQUUsTUFBTSxxQkFBcUIsQ0FBQTtBQUN2RCxPQUFPLEVBQUUsY0FBYyxFQUFFLE1BQU0sbUJBQW1CLENBQUE7QUFFbEQsT0FBTyxFQUFFLE1BQU0sSUFBSSxZQUFZLEVBQUUsTUFBTSxtQkFBbUIsQ0FBQTtBQUMxRCxPQUFPLEVBQUUsTUFBTSxJQUFJLGdCQUFnQixFQUFFLE1BQU0sdUJBQXVCLENBQUE7QUFFbEUsTUFBTSxDQUFDLE1BQU0sV0FBVyxHQUFHLEdBQUcsRUFBRTtJQUU5QixJQUFJLFNBQVMsR0FBRyxnQkFBZ0IsQ0FBQTtJQUNoQyxJQUFJLFVBQVUsR0FBRyxZQUFZLENBQUE7SUFDN0IsSUFBSSxlQUFlLEdBQUc7UUFDcEI7WUFDRSxJQUFJLEVBQUUsZUFBZTtZQUNyQixNQUFNLEVBQUUsZUFBZTtTQUN4QjtRQUNEO1lBQ0UsSUFBSSxFQUFFLG1CQUFtQjtZQUN6QixNQUFNLEVBQUUsbUJBQW1CO1NBQzVCO0tBQ0YsQ0FBQTtJQUVELE1BQU0sZ0JBQWdCLEdBQUcsSUFBSSxDQUFDLE9BQU8sQ0FBQyxXQUFXLEVBQUUsRUFBRSxhQUFhLENBQUMsQ0FBQTtJQUNuRSxJQUFJLE1BQU0sQ0FBQyxnQkFBZ0IsQ0FBQyxFQUFFLENBQUM7UUFDN0IsVUFBVSxHQUFHLElBQUksQ0FBQyxnQkFBZ0IsRUFBRSxNQUFNLENBQVEsQ0FBQTtJQUNwRCxDQUFDO0lBRUQsTUFBTSxnQkFBZ0IsR0FBRyxJQUFJLENBQUMsT0FBTyxDQUFDLFdBQVcsRUFBRSxFQUFFLGlCQUFpQixDQUFDLENBQUE7SUFDdkUsSUFBSSxNQUFNLENBQUMsZ0JBQWdCLENBQUMsRUFBRSxDQUFDO1FBQzdCLFNBQVMsR0FBRyxJQUFJLENBQUMsZ0JBQWdCLEVBQUUsTUFBTSxDQUFRLENBQUE7SUFDbkQsQ0FBQztJQUNELE1BQU0sTUFBTSxHQUFhLEVBQUUsQ0FBQTtJQUMzQixNQUFNLENBQUMsSUFBSSxDQUFDLEtBQUssQ0FBQyxPQUFPLENBQUMsSUFBSSxDQUFDLHlCQUF5QixDQUFDLENBQUMsQ0FBQTtJQUMxRCxNQUFNLENBQUMsSUFBSSxDQUFDLEdBQUcsNkJBQTZCLENBQUMsU0FBZ0IsQ0FBQyxDQUFDLENBQUE7SUFFL0QsTUFBTSxDQUFDLElBQUksQ0FBQyxLQUFLLENBQUMsT0FBTyxDQUFDLElBQUksQ0FBQyxxQkFBcUIsQ0FBQyxDQUFDLENBQUE7SUFDdEQsTUFBTSxDQUFDLElBQUksQ0FBQyxHQUFHLHlCQUF5QixDQUFDLFVBQWlCLENBQUMsQ0FBQyxDQUFBO0lBQzVELE1BQU0sQ0FBQyxJQUFJLENBQUMsU0FBUyxDQUFDLENBQUE7SUFFdEIsTUFBTSxDQUFDLElBQUksQ0FBQyxLQUFLLENBQUMsT0FBTyxDQUFDLElBQUksQ0FBQyx1QkFBdUIsQ0FBQyxDQUFDLENBQUE7SUFDeEQsTUFBTSxDQUFDLElBQUksQ0FBQyxHQUFHLHlCQUF5QixDQUFDLGVBQXNCLENBQUMsQ0FBQyxDQUFBO0lBQ2pFLE1BQU0sQ0FBQyxJQUFJLENBQUMsU0FBUyxDQUFDLENBQUE7SUFDdEIsT0FBTyxNQUFNLENBQUE7QUFDZixDQUFDLENBQUE7QUFFRCxNQUFNLENBQUMsTUFBTSxNQUFNLEdBQUcsR0FBRyxFQUFFO0lBQ3pCLE1BQU0sTUFBTSxHQUFhLEVBQUUsQ0FBQTtJQUMzQixNQUFNLGNBQWMsR0FBRyxJQUFJLENBQUMsT0FBTyxDQUFDLHFCQUFxQixDQUFDLENBQUE7SUFDMUQsSUFBSSxDQUFDLE1BQU0sQ0FBQyxjQUFjLENBQUMsRUFBRSxDQUFDO1FBQzVCLHFCQUFxQixFQUFFLENBQUE7SUFDekIsQ0FBQztTQUFJLENBQUM7UUFDSixNQUFNLFNBQVMsR0FBMkIsSUFBSSxDQUFDLGNBQWMsRUFBRSxNQUFNLENBQTJCLENBQUE7UUFDaEcsTUFBTSxDQUFDLElBQUksQ0FBQyxLQUFLLENBQUMsT0FBTyxDQUFDLElBQUksQ0FBQyx5QkFBeUIsQ0FBQyxDQUFDLENBQUE7UUFDMUQsTUFBTSxDQUFDLElBQUksQ0FBQyxHQUFHLDZCQUE2QixDQUFDLFNBQVMsQ0FBQyxNQUFNLENBQUMsQ0FBQyxDQUFBO0lBQ2pFLENBQUM7SUFDRCxNQUFNLENBQUMsS0FBSyxDQUFDLDJCQUEyQixFQUFFLGlCQUFpQixDQUFDLENBQUE7SUFFNUQsTUFBTSxVQUFVLEdBQUcsSUFBSSxDQUFDLE9BQU8sQ0FBQyxpQkFBaUIsQ0FBQyxDQUFBO0lBQ2xELE1BQU0sTUFBTSxHQUFHLGNBQWMsRUFBUyxDQUFBO0lBQ3RDLElBQUksQ0FBQyxNQUFNLENBQUMsVUFBVSxDQUFDLElBQUksTUFBTSxFQUFFLE1BQU0sRUFBRSxHQUFHLEVBQUUsQ0FBQztRQUMvQyxpQkFBaUIsQ0FBQyxNQUFNLENBQUMsTUFBTSxDQUFDLEdBQUcsQ0FBQyxDQUFBO0lBQ3RDLENBQUM7SUFFRCxJQUFJLE1BQU0sQ0FBQyxVQUFVLENBQUMsRUFBRSxDQUFDO1FBQ3ZCLE1BQU0sU0FBUyxHQUF1QixJQUFJLENBQUMsVUFBVSxFQUFFLE1BQU0sQ0FBdUIsQ0FBQTtRQUNwRixNQUFNLENBQUMsSUFBSSxDQUFDLEtBQUssQ0FBQyxPQUFPLENBQUMsSUFBSSxDQUFDLHFCQUFxQixDQUFDLENBQUMsQ0FBQTtRQUN0RCxNQUFNLENBQUMsSUFBSSxDQUFDLEdBQUcseUJBQXlCLENBQUMsU0FBUyxDQUFDLE1BQU0sQ0FBQyxDQUFDLENBQUE7SUFDN0QsQ0FBQztJQUNELE1BQU0sQ0FBQyxLQUFLLENBQUMsdUJBQXVCLEVBQUUsaUJBQWlCLENBQUMsQ0FBQTtJQUN4RCxNQUFNLENBQUMsSUFBSSxDQUFDLFNBQVMsQ0FBQyxDQUFBO0lBQ3RCLE9BQU8sTUFBTSxDQUFBO0FBQ2YsQ0FBQyxDQUFBO0FBRUQsTUFBTSxDQUFDLE1BQU0sR0FBRyxHQUFHLEdBQUcsRUFBRTtJQUN0QixJQUFJLE1BQU0sR0FBVSxFQUFFLENBQUE7SUFDdEIsTUFBTSxjQUFjLEdBQUcsSUFBSSxDQUFDLE9BQU8sQ0FBQyxxQkFBcUIsQ0FBQyxDQUFBO0lBQzFELElBQUksQ0FBQyxNQUFNLENBQUMsY0FBYyxDQUFDLEVBQUUsQ0FBQztRQUM1QixxQkFBcUIsRUFBRSxDQUFBO0lBQ3pCLENBQUM7U0FBSSxDQUFDO1FBQ0osTUFBTSxTQUFTLEdBQTJCLElBQUksQ0FBQyxjQUFjLEVBQUUsTUFBTSxDQUEyQixDQUFBO1FBQ2hHLE1BQU0sR0FBRyxNQUFNLENBQUMsTUFBTSxDQUFDLFNBQVMsQ0FBQyxNQUFNLENBQUMsQ0FBQTtJQUMxQyxDQUFDO0lBQ0QsTUFBTSxVQUFVLEdBQUcsSUFBSSxDQUFDLE9BQU8sQ0FBQyxpQkFBaUIsQ0FBQyxDQUFBO0lBQ2xELE1BQU0sTUFBTSxHQUFHLGNBQWMsRUFBUyxDQUFBO0lBQ3RDLElBQUksQ0FBQyxNQUFNLENBQUMsVUFBVSxDQUFDLElBQUksTUFBTSxFQUFFLE1BQU0sRUFBRSxHQUFHLEVBQUUsQ0FBQztRQUMvQyxpQkFBaUIsQ0FBQyxNQUFNLENBQUMsTUFBTSxDQUFDLEdBQUcsQ0FBQyxDQUFBO0lBQ3RDLENBQUM7SUFFRCxJQUFJLE1BQU0sQ0FBQyxVQUFVLENBQUMsRUFBRSxDQUFDO1FBQ3ZCLE1BQU0sU0FBUyxHQUF1QixJQUFJLENBQUMsVUFBVSxFQUFFLE1BQU0sQ0FBdUIsQ0FBQTtRQUNwRixNQUFNLENBQUMsSUFBSSxDQUFDLEtBQUssQ0FBQyxPQUFPLENBQUMsSUFBSSxDQUFDLHFCQUFxQixDQUFDLENBQUMsQ0FBQTtRQUN0RCxNQUFNLEdBQUcsTUFBTSxDQUFDLE1BQU0sQ0FBQyxTQUFTLENBQUMsTUFBTSxDQUFDLENBQUE7SUFDMUMsQ0FBQztJQUNELE9BQU8sTUFBTSxDQUFBO0FBQ2YsQ0FBQyxDQUFBIn0= \ No newline at end of file diff --git a/packages/kbot/logs/params.json b/packages/kbot/logs/params.json index 2926ab73..01754106 100644 --- a/packages/kbot/logs/params.json +++ b/packages/kbot/logs/params.json @@ -1,9 +1,9 @@ { - "model": "openai/gpt-4o-mini", + "model": "openrouter/quasar-alpha", "messages": [ { "role": "user", - "content": "divide 15 by 3. Return only the number, no explanation." + "content": "Calculate 2 raised to the power of 3. Return only the number, no explanation." }, { "role": "user", diff --git a/packages/kbot/package.json b/packages/kbot/package.json index 03810886..13766ff5 100644 --- a/packages/kbot/package.json +++ b/packages/kbot/package.json @@ -23,6 +23,7 @@ "test:seo": "vitest run tests/unit/seo.test.ts", "test:language": "vitest run tests/unit/language.test.ts", "test:tools": "vitest run tests/unit/tools.test.ts", + "test:coding": "vitest run tests/unit/coding.test.ts", "test2:watch": "vitest", "test2:coverage": "vitest run --coverage", "webpack": "webpack --config webpack.config.js --stats-error-details", @@ -49,6 +50,7 @@ "p-map": "7.0.3", "ts-retry": "6.0.0", "tslog": "^4.9.3", + "vm2": "^3.9.19", "yargs": "17.7.2", "zod": "3.24.2" }, diff --git a/packages/kbot/tests/unit/coding.test.ts b/packages/kbot/tests/unit/coding.test.ts new file mode 100644 index 00000000..c48cf25a --- /dev/null +++ b/packages/kbot/tests/unit/coding.test.ts @@ -0,0 +1,122 @@ +import { describe, it, expect } from 'vitest' +import * as path from 'node:path' +import { sync as exists } from "@polymech/fs/exists" +import { sync as read } from "@polymech/fs/read" +import { sync as write } from "@polymech/fs/write" +import { sync as mkdirp } from "mkdirp" +import { VM } from 'vm2' + +import { + getDefaultModels, + TEST_BASE_PATH, + TEST_LOGS_PATH, + TEST_PREFERENCES_PATH, + TEST_TIMEOUT, + TestResult, + runTest, + generateTestReport, + getReportPaths +} from './commons' + +// Optionally override models for this specific test file +const models = getDefaultModels() + +// Ensure test-data/code directory exists +const TEST_CODE_DIR = path.resolve(__dirname, '../test-data/code') +if (exists(TEST_CODE_DIR) !== 'directory') { + mkdirp(TEST_CODE_DIR) +} + +describe('Coding Capabilities', () => { + let testResults: TestResult[] = [] + const TEST_LOG_PATH = getReportPaths('coding', 'json') + const TEST_REPORT_PATH = getReportPaths('coding', 'md') + + const executeCode = (code: string, context: any = {}): any => { + const vm = new VM({ + timeout: 1000, + sandbox: context + }) + return vm.run(code) + } + + it.each(models)('should generate and execute a simple function with model %s', async (modelName) => { + const prompt = `Generate a JavaScript function that adds two numbers and returns the result. + The function should be named 'add' and take two parameters 'a' and 'b'. + Return only the function code, no explanation.` + + const result = await runTest( + prompt, + 'function add(a, b) { return a + b; }', + 'simple_function', + modelName, + TEST_LOG_PATH + ) + + testResults.push(result) + + // Save the code to a file + const codePath = path.resolve(TEST_CODE_DIR, 'add.js') + write(codePath, result.result[0]) + + // Execute the code + const context = {} + const addFunction = executeCode(result.result[0], context) + expect(addFunction(5, 3)).toBe(8) + }, { timeout: TEST_TIMEOUT }) + + it.each(models)('should generate and execute a factorial function with model %s', async (modelName) => { + const prompt = `Generate a JavaScript function that calculates the factorial of a number. + The function should be named 'factorial' and take one parameter 'n'. + Return only the function code, no explanation.` + + const result = await runTest( + prompt, + 'function factorial(n) { return n <= 1 ? 1 : n * factorial(n - 1); }', + 'factorial_function', + modelName, + TEST_LOG_PATH + ) + + testResults.push(result) + + // Save the code to a file + const codePath = path.resolve(TEST_CODE_DIR, 'factorial.js') + write(codePath, result.result[0]) + + // Execute the code + const context = {} + const factorialFunction = executeCode(result.result[0], context) + expect(factorialFunction(5)).toBe(120) + }, { timeout: TEST_TIMEOUT }) + + it.each(models)('should generate and execute a fibonacci function with model %s', async (modelName) => { + const prompt = `Generate a JavaScript function that calculates the nth Fibonacci number. + The function should be named 'fibonacci' and take one parameter 'n'. + Return only the function code, no explanation.` + + const result = await runTest( + prompt, + 'function fibonacci(n) { return n <= 1 ? n : fibonacci(n - 1) + fibonacci(n - 2); }', + 'fibonacci_function', + modelName, + TEST_LOG_PATH + ) + + testResults.push(result) + + // Save the code to a file + const codePath = path.resolve(TEST_CODE_DIR, 'fibonacci.js') + write(codePath, result.result[0]) + + // Execute the code + const context = {} + const fibonacciFunction = executeCode(result.result[0], context) + expect(fibonacciFunction(6)).toBe(8) + }, { timeout: TEST_TIMEOUT }) + + it('should generate markdown report', () => { + generateTestReport(testResults, 'Coding Capabilities Test Results', TEST_REPORT_PATH) + expect(exists(TEST_REPORT_PATH) === 'file').toBe(true) + }) +}) \ No newline at end of file diff --git a/packages/kbot/tests/unit/commons.ts b/packages/kbot/tests/unit/commons.ts index fc550a93..18e4e1c1 100644 --- a/packages/kbot/tests/unit/commons.ts +++ b/packages/kbot/tests/unit/commons.ts @@ -17,7 +17,8 @@ export const getFastModels = (): string[] => { return [ E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO, E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE, - E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI + E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI, + E_OPENROUTER_MODEL.MODEL_OPENROUTER_QUASAR_ALPHA ] } diff --git a/packages/kbot/tests/unit/reports/all.json b/packages/kbot/tests/unit/reports/all.json index 6acb99f2..836988a1 100644 --- a/packages/kbot/tests/unit/reports/all.json +++ b/packages/kbot/tests/unit/reports/all.json @@ -7176,6 +7176,457 @@ "passed": true, "duration": 954, "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:36:55.754Z", + "passed": true, + "duration": 1505, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:36:59.232Z", + "passed": true, + "duration": 3470, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:37:00.080Z", + "passed": true, + "duration": 842, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:37:00.897Z", + "passed": true, + "duration": 811, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:37:01.784Z", + "passed": true, + "duration": 881, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [], + "expected": "24", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:37:03.117Z", + "passed": false, + "duration": 1327, + "reason": "Model returned empty response", + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:37:04.222Z", + "passed": true, + "duration": 1096, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:37:05.008Z", + "passed": true, + "duration": 780, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:37:05.799Z", + "passed": true, + "duration": 784, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:37:10.272Z", + "passed": true, + "duration": 4467, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:37:11.255Z", + "passed": true, + "duration": 975, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:37:11.993Z", + "passed": true, + "duration": 731, + "category": "basic" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-2,-3", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:38:12.580Z", + "passed": true, + "duration": 1229, + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "The solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nAnswer: -3, -2" + ], + "expected": "-2,-3", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:38:24.221Z", + "passed": false, + "duration": 11633, + "reason": "Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nanswer: -3, -2", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-2,-3", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:38:25.175Z", + "passed": false, + "duration": 943, + "reason": "Expected -2,-3, but got -2, -3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-2,-3", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:38:26.290Z", + "passed": true, + "duration": 1105, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:38:27.138Z", + "passed": true, + "duration": 838, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:38:34.971Z", + "passed": true, + "duration": 7825, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:38:35.899Z", + "passed": true, + "duration": 920, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:38:36.748Z", + "passed": true, + "duration": 840, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:38:37.951Z", + "passed": true, + "duration": 1195, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:38:49.318Z", + "passed": false, + "duration": 11358, + "reason": "Expected 8, but got 5", + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:38:50.264Z", + "passed": false, + "duration": 935, + "reason": "Expected 8, but got 5", + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:38:50.973Z", + "passed": true, + "duration": 701, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:38:51.774Z", + "passed": true, + "duration": 793, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:39:08.114Z", + "passed": true, + "duration": 16332, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:39:09.133Z", + "passed": true, + "duration": 1012, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:39:10.677Z", + "passed": true, + "duration": 1535, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:39:11.607Z", + "passed": true, + "duration": 922, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:39:18.707Z", + "passed": true, + "duration": 7091, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:39:19.719Z", + "passed": true, + "duration": 1004, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:39:21.294Z", + "passed": true, + "duration": 1567, + "category": "math" } ], "highscores": [ @@ -7184,13 +7635,13 @@ "rankings": [ { "model": "openai/gpt-4o-mini", - "duration": 935, - "duration_secs": 0.935 + "duration": 943, + "duration_secs": 0.943 }, { - "model": "anthropic/claude-3.5-sonnet", - "duration": 1435, - "duration_secs": 1.435 + "model": "openrouter/quasar-alpha", + "duration": 1105, + "duration_secs": 1.105 } ] }, @@ -7198,14 +7649,14 @@ "test": "factorial", "rankings": [ { - "model": "openai/gpt-4o-mini", - "duration": 861, - "duration_secs": 0.861 + "model": "openai/gpt-3.5-turbo", + "duration": 838, + "duration_secs": 0.838 }, { - "model": "anthropic/claude-3.5-sonnet", - "duration": 1454, - "duration_secs": 1.454 + "model": "openrouter/quasar-alpha", + "duration": 840, + "duration_secs": 0.84 } ] }, @@ -7213,29 +7664,29 @@ "test": "fibonacci", "rankings": [ { - "model": "openai/gpt-4o-mini", - "duration": 756, - "duration_secs": 0.756 + "model": "openrouter/quasar-alpha", + "duration": 701, + "duration_secs": 0.701 }, { - "model": "openai/gpt-3.5-turbo", - "duration": 792, - "duration_secs": 0.792 + "model": "openai/gpt-4o-mini", + "duration": 935, + "duration_secs": 0.935 } ] }, { "test": "square_root", "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 793, + "duration_secs": 0.793 + }, { "model": "anthropic/claude-3.5-sonnet", "duration": 819, "duration_secs": 0.819 - }, - { - "model": "openai/gpt-4o-mini", - "duration": 828, - "duration_secs": 0.828 } ] }, @@ -7244,13 +7695,13 @@ "rankings": [ { "model": "openai/gpt-3.5-turbo", - "duration": 795, - "duration_secs": 0.795 + "duration": 922, + "duration_secs": 0.922 }, { "model": "openai/gpt-4o-mini", - "duration": 966, - "duration_secs": 0.966 + "duration": 1004, + "duration_secs": 1.004 } ] }, @@ -7258,14 +7709,14 @@ "test": "addition", "rankings": [ { - "model": "openai/gpt-4o-mini", - "duration": 910, - "duration_secs": 0.91 + "model": "openrouter/quasar-alpha", + "duration": 811, + "duration_secs": 0.811 }, { - "model": "openai/gpt-3.5-turbo", - "duration": 1484, - "duration_secs": 1.484 + "model": "openai/gpt-4o-mini", + "duration": 842, + "duration_secs": 0.842 } ] }, @@ -7273,14 +7724,14 @@ "test": "multiplication", "rankings": [ { - "model": "openai/gpt-3.5-turbo", - "duration": 955, - "duration_secs": 0.955 + "model": "openrouter/quasar-alpha", + "duration": 780, + "duration_secs": 0.78 }, { - "model": "openai/gpt-4o-mini", - "duration": 1095, - "duration_secs": 1.095 + "model": "openai/gpt-3.5-turbo", + "duration": 881, + "duration_secs": 0.881 } ] }, @@ -7288,14 +7739,14 @@ "test": "division", "rankings": [ { - "model": "openai/gpt-3.5-turbo", - "duration": 816, - "duration_secs": 0.816 + "model": "openrouter/quasar-alpha", + "duration": 731, + "duration_secs": 0.731 }, { - "model": "qwen/qwq-32b", - "duration": 917, - "duration_secs": 0.917 + "model": "openai/gpt-3.5-turbo", + "duration": 784, + "duration_secs": 0.784 } ] }, @@ -7360,5 +7811,5 @@ ] } ], - "lastUpdated": "2025-04-03T17:14:41.955Z" + "lastUpdated": "2025-04-04T12:39:21.300Z" } \ No newline at end of file diff --git a/packages/kbot/tests/unit/reports/basic.json b/packages/kbot/tests/unit/reports/basic.json index 5992eb5a..911017e3 100644 --- a/packages/kbot/tests/unit/reports/basic.json +++ b/packages/kbot/tests/unit/reports/basic.json @@ -1251,6 +1251,173 @@ "passed": true, "duration": 954, "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:36:55.754Z", + "passed": true, + "duration": 1505, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:36:59.232Z", + "passed": true, + "duration": 3470, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:37:00.080Z", + "passed": true, + "duration": 842, + "category": "basic" + }, + { + "test": "addition", + "prompt": "add 5 and 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:37:00.897Z", + "passed": true, + "duration": 811, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:37:01.784Z", + "passed": true, + "duration": 881, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [], + "expected": "24", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:37:03.117Z", + "passed": false, + "duration": 1327, + "reason": "Model returned empty response", + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:37:04.222Z", + "passed": true, + "duration": 1096, + "category": "basic" + }, + { + "test": "multiplication", + "prompt": "multiply 8 and 3. Return only the number, no explanation.", + "result": [ + "24" + ], + "expected": "24", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:37:05.008Z", + "passed": true, + "duration": 780, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:37:05.799Z", + "passed": true, + "duration": 784, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:37:10.272Z", + "passed": true, + "duration": 4467, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:37:11.255Z", + "passed": true, + "duration": 975, + "category": "basic" + }, + { + "test": "division", + "prompt": "divide 15 by 3. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "5", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:37:11.993Z", + "passed": true, + "duration": 731, + "category": "basic" } ], "highscores": [ @@ -1258,14 +1425,14 @@ "test": "addition", "rankings": [ { - "model": "openai/gpt-4o-mini", - "duration": 910, - "duration_secs": 0.91 + "model": "openrouter/quasar-alpha", + "duration": 811, + "duration_secs": 0.811 }, { - "model": "openai/gpt-3.5-turbo", - "duration": 1484, - "duration_secs": 1.484 + "model": "openai/gpt-4o-mini", + "duration": 842, + "duration_secs": 0.842 } ] }, @@ -1273,14 +1440,14 @@ "test": "multiplication", "rankings": [ { - "model": "openai/gpt-3.5-turbo", - "duration": 955, - "duration_secs": 0.955 + "model": "openrouter/quasar-alpha", + "duration": 780, + "duration_secs": 0.78 }, { - "model": "openai/gpt-4o-mini", - "duration": 1095, - "duration_secs": 1.095 + "model": "openai/gpt-3.5-turbo", + "duration": 881, + "duration_secs": 0.881 } ] }, @@ -1288,17 +1455,17 @@ "test": "division", "rankings": [ { - "model": "openai/gpt-3.5-turbo", - "duration": 816, - "duration_secs": 0.816 + "model": "openrouter/quasar-alpha", + "duration": 731, + "duration_secs": 0.731 }, { - "model": "qwen/qwq-32b", - "duration": 917, - "duration_secs": 0.917 + "model": "openai/gpt-3.5-turbo", + "duration": 784, + "duration_secs": 0.784 } ] } ], - "lastUpdated": "2025-04-03T17:14:41.951Z" + "lastUpdated": "2025-04-04T12:37:11.994Z" } \ No newline at end of file diff --git a/packages/kbot/tests/unit/reports/basic.md b/packages/kbot/tests/unit/reports/basic.md index 01c15e3b..3f5c6383 100644 --- a/packages/kbot/tests/unit/reports/basic.md +++ b/packages/kbot/tests/unit/reports/basic.md @@ -6,38 +6,37 @@ | Test | Model | Duration (ms) | Duration (s) | |------|-------|--------------|--------------| -| addition | openai/gpt-4o-mini | 910 | 0.91 | -| addition | openai/gpt-3.5-turbo | 1484 | 1.48 | -| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 8460 | 8.46 | -| multiplication | openai/gpt-3.5-turbo | 955 | 0.95 | -| multiplication | openai/gpt-4o-mini | 1095 | 1.09 | -| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 7653 | 7.65 | -| division | openai/gpt-3.5-turbo | 816 | 0.82 | -| division | openai/gpt-4o-mini | 954 | 0.95 | -| division | deepseek/deepseek-r1-distill-qwen-14b:free | 16655 | 16.66 | +| addition | openrouter/quasar-alpha | 811 | 0.81 | +| addition | openai/gpt-4o-mini | 842 | 0.84 | +| addition | openai/gpt-3.5-turbo | 1505 | 1.50 | +| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 3470 | 3.47 | +| multiplication | openrouter/quasar-alpha | 780 | 0.78 | +| multiplication | openai/gpt-3.5-turbo | 881 | 0.88 | +| multiplication | openai/gpt-4o-mini | 1096 | 1.10 | +| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 1327 | 1.33 | +| division | openrouter/quasar-alpha | 731 | 0.73 | +| division | openai/gpt-3.5-turbo | 784 | 0.78 | +| division | openai/gpt-4o-mini | 975 | 0.97 | +| division | deepseek/deepseek-r1-distill-qwen-14b:free | 4467 | 4.47 | ## Summary -- Total Tests: 9 -- Passed: 8 +- Total Tests: 12 +- Passed: 11 - Failed: 1 -- Success Rate: 88.89% -- Average Duration: 4331ms (4.33s) +- Success Rate: 91.67% +- Average Duration: 1472ms (1.47s) ## Failed Tests -### division - deepseek/deepseek-r1-distill-qwen-14b:free +### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free -- Prompt: `divide 15 by 3. Return only the number, no explanation.` -- Expected: `5` -- Actual: `15 divided by 3 is 5. - -Answer: 5` -- Duration: 16655ms (16.66s) -- Reason: Expected 5, but got 15 divided by 3 is 5. - -answer: 5 -- Timestamp: 4/3/2025, 7:14:40 PM +- Prompt: `multiply 8 and 3. Return only the number, no explanation.` +- Expected: `24` +- Actual: `` +- Duration: 1327ms (1.33s) +- Reason: Model returned empty response +- Timestamp: 4/4/2025, 2:37:03 PM ## Passed Tests @@ -46,62 +45,86 @@ answer: 5 - Prompt: `add 5 and 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 1484ms (1.48s) -- Timestamp: 4/3/2025, 7:14:04 PM +- Duration: 1505ms (1.50s) +- Timestamp: 4/4/2025, 2:36:55 PM ### addition - deepseek/deepseek-r1-distill-qwen-14b:free - Prompt: `add 5 and 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 8460ms (8.46s) -- Timestamp: 4/3/2025, 7:14:12 PM +- Duration: 3470ms (3.47s) +- Timestamp: 4/4/2025, 2:36:59 PM ### addition - openai/gpt-4o-mini - Prompt: `add 5 and 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 910ms (0.91s) -- Timestamp: 4/3/2025, 7:14:13 PM +- Duration: 842ms (0.84s) +- Timestamp: 4/4/2025, 2:37:00 PM + +### addition - openrouter/quasar-alpha + +- Prompt: `add 5 and 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 811ms (0.81s) +- Timestamp: 4/4/2025, 2:37:00 PM ### multiplication - openai/gpt-3.5-turbo - Prompt: `multiply 8 and 3. Return only the number, no explanation.` - Expected: `24` - Actual: `24` -- Duration: 955ms (0.95s) -- Timestamp: 4/3/2025, 7:14:14 PM - -### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free - -- Prompt: `multiply 8 and 3. Return only the number, no explanation.` -- Expected: `24` -- Actual: `24` -- Duration: 7653ms (7.65s) -- Timestamp: 4/3/2025, 7:14:22 PM +- Duration: 881ms (0.88s) +- Timestamp: 4/4/2025, 2:37:01 PM ### multiplication - openai/gpt-4o-mini - Prompt: `multiply 8 and 3. Return only the number, no explanation.` - Expected: `24` - Actual: `24` -- Duration: 1095ms (1.09s) -- Timestamp: 4/3/2025, 7:14:23 PM +- Duration: 1096ms (1.10s) +- Timestamp: 4/4/2025, 2:37:04 PM + +### multiplication - openrouter/quasar-alpha + +- Prompt: `multiply 8 and 3. Return only the number, no explanation.` +- Expected: `24` +- Actual: `24` +- Duration: 780ms (0.78s) +- Timestamp: 4/4/2025, 2:37:05 PM ### division - openai/gpt-3.5-turbo - Prompt: `divide 15 by 3. Return only the number, no explanation.` - Expected: `5` - Actual: `5` -- Duration: 816ms (0.82s) -- Timestamp: 4/3/2025, 7:14:24 PM +- Duration: 784ms (0.78s) +- Timestamp: 4/4/2025, 2:37:05 PM + +### division - deepseek/deepseek-r1-distill-qwen-14b:free + +- Prompt: `divide 15 by 3. Return only the number, no explanation.` +- Expected: `5` +- Actual: `5` +- Duration: 4467ms (4.47s) +- Timestamp: 4/4/2025, 2:37:10 PM ### division - openai/gpt-4o-mini - Prompt: `divide 15 by 3. Return only the number, no explanation.` - Expected: `5` - Actual: `5` -- Duration: 954ms (0.95s) -- Timestamp: 4/3/2025, 7:14:41 PM +- Duration: 975ms (0.97s) +- Timestamp: 4/4/2025, 2:37:11 PM + +### division - openrouter/quasar-alpha + +- Prompt: `divide 15 by 3. Return only the number, no explanation.` +- Expected: `5` +- Actual: `5` +- Duration: 731ms (0.73s) +- Timestamp: 4/4/2025, 2:37:11 PM diff --git a/packages/kbot/tests/unit/reports/math.json b/packages/kbot/tests/unit/reports/math.json index dea69c08..a50d2054 100644 --- a/packages/kbot/tests/unit/reports/math.json +++ b/packages/kbot/tests/unit/reports/math.json @@ -2828,6 +2828,290 @@ "passed": true, "duration": 966, "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-2,-3", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:38:12.580Z", + "passed": true, + "duration": 1229, + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "The solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nAnswer: -3, -2" + ], + "expected": "-2,-3", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:38:24.221Z", + "passed": false, + "duration": 11633, + "reason": "Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nanswer: -3, -2", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2, -3" + ], + "expected": "-2,-3", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:38:25.175Z", + "passed": false, + "duration": 943, + "reason": "Expected -2,-3, but got -2, -3", + "category": "math" + }, + { + "test": "quadratic", + "prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.", + "result": [ + "-2,-3" + ], + "expected": "-2,-3", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:38:26.290Z", + "passed": true, + "duration": 1105, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:38:27.138Z", + "passed": true, + "duration": 838, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:38:34.971Z", + "passed": true, + "duration": 7825, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:38:35.899Z", + "passed": true, + "duration": 920, + "category": "math" + }, + { + "test": "factorial", + "prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.", + "result": [ + "120" + ], + "expected": "120", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:38:36.748Z", + "passed": true, + "duration": 840, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:38:37.951Z", + "passed": true, + "duration": 1195, + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:38:49.318Z", + "passed": false, + "duration": 11358, + "reason": "Expected 8, but got 5", + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "5" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:38:50.264Z", + "passed": false, + "duration": 935, + "reason": "Expected 8, but got 5", + "category": "math" + }, + { + "test": "fibonacci", + "prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:38:50.973Z", + "passed": true, + "duration": 701, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:38:51.774Z", + "passed": true, + "duration": 793, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:39:08.114Z", + "passed": true, + "duration": 16332, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:39:09.133Z", + "passed": true, + "duration": 1012, + "category": "math" + }, + { + "test": "square_root", + "prompt": "Calculate the square root of 16. Return only the number, no explanation.", + "result": [ + "4" + ], + "expected": "4", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:39:10.677Z", + "passed": true, + "duration": 1535, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-3.5-turbo", + "router": "openai/gpt-3.5-turbo", + "timestamp": "2025-04-04T12:39:11.607Z", + "passed": true, + "duration": 922, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "deepseek/deepseek-r1-distill-qwen-14b:free", + "router": "deepseek/deepseek-r1-distill-qwen-14b:free", + "timestamp": "2025-04-04T12:39:18.707Z", + "passed": true, + "duration": 7091, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openai/gpt-4o-mini", + "router": "openai/gpt-4o-mini", + "timestamp": "2025-04-04T12:39:19.719Z", + "passed": true, + "duration": 1004, + "category": "math" + }, + { + "test": "power", + "prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.", + "result": [ + "8" + ], + "expected": "8", + "model": "openrouter/quasar-alpha", + "router": "openrouter/quasar-alpha", + "timestamp": "2025-04-04T12:39:21.294Z", + "passed": true, + "duration": 1567, + "category": "math" } ], "highscores": [ @@ -2836,13 +3120,13 @@ "rankings": [ { "model": "openai/gpt-4o-mini", - "duration": 935, - "duration_secs": 0.935 + "duration": 943, + "duration_secs": 0.943 }, { - "model": "anthropic/claude-3.5-sonnet", - "duration": 1435, - "duration_secs": 1.435 + "model": "openrouter/quasar-alpha", + "duration": 1105, + "duration_secs": 1.105 } ] }, @@ -2850,14 +3134,14 @@ "test": "factorial", "rankings": [ { - "model": "openai/gpt-4o-mini", - "duration": 861, - "duration_secs": 0.861 + "model": "openai/gpt-3.5-turbo", + "duration": 838, + "duration_secs": 0.838 }, { - "model": "anthropic/claude-3.5-sonnet", - "duration": 1454, - "duration_secs": 1.454 + "model": "openrouter/quasar-alpha", + "duration": 840, + "duration_secs": 0.84 } ] }, @@ -2865,29 +3149,29 @@ "test": "fibonacci", "rankings": [ { - "model": "openai/gpt-4o-mini", - "duration": 756, - "duration_secs": 0.756 + "model": "openrouter/quasar-alpha", + "duration": 701, + "duration_secs": 0.701 }, { - "model": "openai/gpt-3.5-turbo", - "duration": 792, - "duration_secs": 0.792 + "model": "openai/gpt-4o-mini", + "duration": 935, + "duration_secs": 0.935 } ] }, { "test": "square_root", "rankings": [ + { + "model": "openai/gpt-3.5-turbo", + "duration": 793, + "duration_secs": 0.793 + }, { "model": "anthropic/claude-3.5-sonnet", "duration": 819, "duration_secs": 0.819 - }, - { - "model": "openai/gpt-4o-mini", - "duration": 828, - "duration_secs": 0.828 } ] }, @@ -2896,16 +3180,16 @@ "rankings": [ { "model": "openai/gpt-3.5-turbo", - "duration": 795, - "duration_secs": 0.795 + "duration": 922, + "duration_secs": 0.922 }, { "model": "openai/gpt-4o-mini", - "duration": 966, - "duration_secs": 0.966 + "duration": 1004, + "duration_secs": 1.004 } ] } ], - "lastUpdated": "2025-04-03T17:10:26.897Z" + "lastUpdated": "2025-04-04T12:39:21.296Z" } \ No newline at end of file diff --git a/packages/kbot/tests/unit/reports/math.md b/packages/kbot/tests/unit/reports/math.md index b9097b9d..a8758a13 100644 --- a/packages/kbot/tests/unit/reports/math.md +++ b/packages/kbot/tests/unit/reports/math.md @@ -6,191 +6,204 @@ | Test | Model | Duration (ms) | Duration (s) | |------|-------|--------------|--------------| -| quadratic | openai/gpt-4o-mini | 935 | 0.94 | -| quadratic | openai/gpt-3.5-turbo | 1685 | 1.69 | -| quadratic | deepseek/deepseek-r1-distill-qwen-14b:free | 10827 | 10.83 | -| factorial | openai/gpt-4o-mini | 861 | 0.86 | -| factorial | openai/gpt-3.5-turbo | 3991 | 3.99 | -| factorial | deepseek/deepseek-r1-distill-qwen-14b:free | 9116 | 9.12 | -| fibonacci | openai/gpt-4o-mini | 756 | 0.76 | -| fibonacci | openai/gpt-3.5-turbo | 792 | 0.79 | -| fibonacci | deepseek/deepseek-r1-distill-qwen-14b:free | 8292 | 8.29 | -| square_root | openai/gpt-4o-mini | 828 | 0.83 | -| square_root | openai/gpt-3.5-turbo | 892 | 0.89 | -| square_root | deepseek/deepseek-r1-distill-qwen-14b:free | 1755 | 1.75 | -| power | openai/gpt-3.5-turbo | 795 | 0.80 | -| power | openai/gpt-4o-mini | 966 | 0.97 | -| power | deepseek/deepseek-r1-distill-qwen-14b:free | 7263 | 7.26 | +| quadratic | openai/gpt-4o-mini | 943 | 0.94 | +| quadratic | openrouter/quasar-alpha | 1105 | 1.10 | +| quadratic | openai/gpt-3.5-turbo | 1229 | 1.23 | +| quadratic | deepseek/deepseek-r1-distill-qwen-14b:free | 11633 | 11.63 | +| factorial | openai/gpt-3.5-turbo | 838 | 0.84 | +| factorial | openrouter/quasar-alpha | 840 | 0.84 | +| factorial | openai/gpt-4o-mini | 920 | 0.92 | +| factorial | deepseek/deepseek-r1-distill-qwen-14b:free | 7825 | 7.83 | +| fibonacci | openrouter/quasar-alpha | 701 | 0.70 | +| fibonacci | openai/gpt-4o-mini | 935 | 0.94 | +| fibonacci | openai/gpt-3.5-turbo | 1195 | 1.20 | +| fibonacci | deepseek/deepseek-r1-distill-qwen-14b:free | 11358 | 11.36 | +| square_root | openai/gpt-3.5-turbo | 793 | 0.79 | +| square_root | openai/gpt-4o-mini | 1012 | 1.01 | +| square_root | openrouter/quasar-alpha | 1535 | 1.53 | +| square_root | deepseek/deepseek-r1-distill-qwen-14b:free | 16332 | 16.33 | +| power | openai/gpt-3.5-turbo | 922 | 0.92 | +| power | openai/gpt-4o-mini | 1004 | 1.00 | +| power | openrouter/quasar-alpha | 1567 | 1.57 | +| power | deepseek/deepseek-r1-distill-qwen-14b:free | 7091 | 7.09 | ## Summary -- Total Tests: 15 -- Passed: 9 -- Failed: 6 -- Success Rate: 60.00% -- Average Duration: 3317ms (3.32s) +- Total Tests: 20 +- Passed: 16 +- Failed: 4 +- Success Rate: 80.00% +- Average Duration: 3489ms (3.49s) ## Failed Tests -### quadratic - openai/gpt-3.5-turbo - -- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.` -- Expected: `-2,-3` -- Actual: `-3, -2` -- Duration: 1685ms (1.69s) -- Reason: Expected -2,-3, but got -3, -2 -- Timestamp: 4/3/2025, 7:09:38 PM - ### quadratic - deepseek/deepseek-r1-distill-qwen-14b:free - Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.` - Expected: `-2,-3` -- Actual: `The solutions to the quadratic equation x² + 5x + 6 = 0 are -2 and -3. +- Actual: `The solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2. -Specific steps: +Answer: -3, -2` +- Duration: 11633ms (11.63s) +- Reason: Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2. -1. **Identify coefficients**: a = 1, b = 5, c = 6. -2. **Calculate discriminant**: b² - 4ac = 25 - 24 = 1. -3. **Apply quadratic formula**: x = (-5 ± √1)/2. -4. **Compute solutions**: x = (-5 + 1)/2 = -2 and x = (-5 - 1)/2 = -3. - --2, -3 - -Answer: -2,-3` -- Duration: 10827ms (10.83s) -- Reason: Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -2 and -3. - -specific steps: - -1. **identify coefficients**: a = 1, b = 5, c = 6. -2. **calculate discriminant**: b² - 4ac = 25 - 24 = 1. -3. **apply quadratic formula**: x = (-5 ± √1)/2. -4. **compute solutions**: x = (-5 + 1)/2 = -2 and x = (-5 - 1)/2 = -3. - --2, -3 - -answer: -2,-3 -- Timestamp: 4/3/2025, 7:09:49 PM +answer: -3, -2 +- Timestamp: 4/4/2025, 2:38:24 PM ### quadratic - openai/gpt-4o-mini - Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.` - Expected: `-2,-3` - Actual: `-2, -3` -- Duration: 935ms (0.94s) +- Duration: 943ms (0.94s) - Reason: Expected -2,-3, but got -2, -3 -- Timestamp: 4/3/2025, 7:09:50 PM - -### factorial - deepseek/deepseek-r1-distill-qwen-14b:free - -- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.` -- Expected: `120` -- Actual: `5! = 120 - -The factorial of 5 is calculated as: - -5 × 4 × 3 × 2 × 1 = 120 - -**Answer:** 120` -- Duration: 9116ms (9.12s) -- Reason: Expected 120, but got 5! = 120 - -the factorial of 5 is calculated as: - -5 × 4 × 3 × 2 × 1 = 120 - -**answer:** 120 -- Timestamp: 4/3/2025, 7:10:03 PM +- Timestamp: 4/4/2025, 2:38:25 PM ### fibonacci - deepseek/deepseek-r1-distill-qwen-14b:free - Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.` - Expected: `8` - Actual: `5` -- Duration: 8292ms (8.29s) +- Duration: 11358ms (11.36s) - Reason: Expected 8, but got 5 -- Timestamp: 4/3/2025, 7:10:13 PM +- Timestamp: 4/4/2025, 2:38:49 PM ### fibonacci - openai/gpt-4o-mini - Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.` - Expected: `8` - Actual: `5` -- Duration: 756ms (0.76s) +- Duration: 935ms (0.94s) - Reason: Expected 8, but got 5 -- Timestamp: 4/3/2025, 7:10:14 PM +- Timestamp: 4/4/2025, 2:38:50 PM ## Passed Tests +### quadratic - openai/gpt-3.5-turbo + +- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.` +- Expected: `-2,-3` +- Actual: `-2,-3` +- Duration: 1229ms (1.23s) +- Timestamp: 4/4/2025, 2:38:12 PM + +### quadratic - openrouter/quasar-alpha + +- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.` +- Expected: `-2,-3` +- Actual: `-2,-3` +- Duration: 1105ms (1.10s) +- Timestamp: 4/4/2025, 2:38:26 PM + ### factorial - openai/gpt-3.5-turbo - Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.` - Expected: `120` - Actual: `120` -- Duration: 3991ms (3.99s) -- Timestamp: 4/3/2025, 7:09:54 PM +- Duration: 838ms (0.84s) +- Timestamp: 4/4/2025, 2:38:27 PM + +### factorial - deepseek/deepseek-r1-distill-qwen-14b:free + +- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.` +- Expected: `120` +- Actual: `120` +- Duration: 7825ms (7.83s) +- Timestamp: 4/4/2025, 2:38:34 PM ### factorial - openai/gpt-4o-mini - Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.` - Expected: `120` - Actual: `120` -- Duration: 861ms (0.86s) -- Timestamp: 4/3/2025, 7:10:04 PM +- Duration: 920ms (0.92s) +- Timestamp: 4/4/2025, 2:38:35 PM + +### factorial - openrouter/quasar-alpha + +- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.` +- Expected: `120` +- Actual: `120` +- Duration: 840ms (0.84s) +- Timestamp: 4/4/2025, 2:38:36 PM ### fibonacci - openai/gpt-3.5-turbo - Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 792ms (0.79s) -- Timestamp: 4/3/2025, 7:10:05 PM +- Duration: 1195ms (1.20s) +- Timestamp: 4/4/2025, 2:38:37 PM + +### fibonacci - openrouter/quasar-alpha + +- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 701ms (0.70s) +- Timestamp: 4/4/2025, 2:38:50 PM ### square_root - openai/gpt-3.5-turbo - Prompt: `Calculate the square root of 16. Return only the number, no explanation.` - Expected: `4` - Actual: `4` -- Duration: 892ms (0.89s) -- Timestamp: 4/3/2025, 7:10:15 PM +- Duration: 793ms (0.79s) +- Timestamp: 4/4/2025, 2:38:51 PM ### square_root - deepseek/deepseek-r1-distill-qwen-14b:free - Prompt: `Calculate the square root of 16. Return only the number, no explanation.` - Expected: `4` - Actual: `4` -- Duration: 1755ms (1.75s) -- Timestamp: 4/3/2025, 7:10:17 PM +- Duration: 16332ms (16.33s) +- Timestamp: 4/4/2025, 2:39:08 PM ### square_root - openai/gpt-4o-mini - Prompt: `Calculate the square root of 16. Return only the number, no explanation.` - Expected: `4` - Actual: `4` -- Duration: 828ms (0.83s) -- Timestamp: 4/3/2025, 7:10:17 PM +- Duration: 1012ms (1.01s) +- Timestamp: 4/4/2025, 2:39:09 PM + +### square_root - openrouter/quasar-alpha + +- Prompt: `Calculate the square root of 16. Return only the number, no explanation.` +- Expected: `4` +- Actual: `4` +- Duration: 1535ms (1.53s) +- Timestamp: 4/4/2025, 2:39:10 PM ### power - openai/gpt-3.5-turbo - Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 795ms (0.80s) -- Timestamp: 4/3/2025, 7:10:18 PM +- Duration: 922ms (0.92s) +- Timestamp: 4/4/2025, 2:39:11 PM ### power - deepseek/deepseek-r1-distill-qwen-14b:free - Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 7263ms (7.26s) -- Timestamp: 4/3/2025, 7:10:25 PM +- Duration: 7091ms (7.09s) +- Timestamp: 4/4/2025, 2:39:18 PM ### power - openai/gpt-4o-mini - Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.` - Expected: `8` - Actual: `8` -- Duration: 966ms (0.97s) -- Timestamp: 4/3/2025, 7:10:26 PM +- Duration: 1004ms (1.00s) +- Timestamp: 4/4/2025, 2:39:19 PM + +### power - openrouter/quasar-alpha + +- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.` +- Expected: `8` +- Actual: `8` +- Duration: 1567ms (1.57s) +- Timestamp: 4/4/2025, 2:39:21 PM