tests: quasar - alpha
This commit is contained in:
parent
214eed04f5
commit
ddf8dbce1f
92
packages/kbot/dist/0c550cfc34328e29d9df.js
vendored
Normal file
92
packages/kbot/dist/0c550cfc34328e29d9df.js
vendored
Normal file
File diff suppressed because one or more lines are too long
@ -1,9 +1,9 @@
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "divide 15 by 3. Return only the number, no explanation."
|
||||
"content": "Calculate 2 raised to the power of 3. Return only the number, no explanation."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
|
||||
@ -23,6 +23,7 @@
|
||||
"test:seo": "vitest run tests/unit/seo.test.ts",
|
||||
"test:language": "vitest run tests/unit/language.test.ts",
|
||||
"test:tools": "vitest run tests/unit/tools.test.ts",
|
||||
"test:coding": "vitest run tests/unit/coding.test.ts",
|
||||
"test2:watch": "vitest",
|
||||
"test2:coverage": "vitest run --coverage",
|
||||
"webpack": "webpack --config webpack.config.js --stats-error-details",
|
||||
@ -49,6 +50,7 @@
|
||||
"p-map": "7.0.3",
|
||||
"ts-retry": "6.0.0",
|
||||
"tslog": "^4.9.3",
|
||||
"vm2": "^3.9.19",
|
||||
"yargs": "17.7.2",
|
||||
"zod": "3.24.2"
|
||||
},
|
||||
|
||||
122
packages/kbot/tests/unit/coding.test.ts
Normal file
122
packages/kbot/tests/unit/coding.test.ts
Normal file
@ -0,0 +1,122 @@
|
||||
import { describe, it, expect } from 'vitest'
|
||||
import * as path from 'node:path'
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as mkdirp } from "mkdirp"
|
||||
import { VM } from 'vm2'
|
||||
|
||||
import {
|
||||
getDefaultModels,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
runTest,
|
||||
generateTestReport,
|
||||
getReportPaths
|
||||
} from './commons'
|
||||
|
||||
// Optionally override models for this specific test file
|
||||
const models = getDefaultModels()
|
||||
|
||||
// Ensure test-data/code directory exists
|
||||
const TEST_CODE_DIR = path.resolve(__dirname, '../test-data/code')
|
||||
if (exists(TEST_CODE_DIR) !== 'directory') {
|
||||
mkdirp(TEST_CODE_DIR)
|
||||
}
|
||||
|
||||
describe('Coding Capabilities', () => {
|
||||
let testResults: TestResult[] = []
|
||||
const TEST_LOG_PATH = getReportPaths('coding', 'json')
|
||||
const TEST_REPORT_PATH = getReportPaths('coding', 'md')
|
||||
|
||||
const executeCode = (code: string, context: any = {}): any => {
|
||||
const vm = new VM({
|
||||
timeout: 1000,
|
||||
sandbox: context
|
||||
})
|
||||
return vm.run(code)
|
||||
}
|
||||
|
||||
it.each(models)('should generate and execute a simple function with model %s', async (modelName) => {
|
||||
const prompt = `Generate a JavaScript function that adds two numbers and returns the result.
|
||||
The function should be named 'add' and take two parameters 'a' and 'b'.
|
||||
Return only the function code, no explanation.`
|
||||
|
||||
const result = await runTest(
|
||||
prompt,
|
||||
'function add(a, b) { return a + b; }',
|
||||
'simple_function',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
|
||||
testResults.push(result)
|
||||
|
||||
// Save the code to a file
|
||||
const codePath = path.resolve(TEST_CODE_DIR, 'add.js')
|
||||
write(codePath, result.result[0])
|
||||
|
||||
// Execute the code
|
||||
const context = {}
|
||||
const addFunction = executeCode(result.result[0], context)
|
||||
expect(addFunction(5, 3)).toBe(8)
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should generate and execute a factorial function with model %s', async (modelName) => {
|
||||
const prompt = `Generate a JavaScript function that calculates the factorial of a number.
|
||||
The function should be named 'factorial' and take one parameter 'n'.
|
||||
Return only the function code, no explanation.`
|
||||
|
||||
const result = await runTest(
|
||||
prompt,
|
||||
'function factorial(n) { return n <= 1 ? 1 : n * factorial(n - 1); }',
|
||||
'factorial_function',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
|
||||
testResults.push(result)
|
||||
|
||||
// Save the code to a file
|
||||
const codePath = path.resolve(TEST_CODE_DIR, 'factorial.js')
|
||||
write(codePath, result.result[0])
|
||||
|
||||
// Execute the code
|
||||
const context = {}
|
||||
const factorialFunction = executeCode(result.result[0], context)
|
||||
expect(factorialFunction(5)).toBe(120)
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should generate and execute a fibonacci function with model %s', async (modelName) => {
|
||||
const prompt = `Generate a JavaScript function that calculates the nth Fibonacci number.
|
||||
The function should be named 'fibonacci' and take one parameter 'n'.
|
||||
Return only the function code, no explanation.`
|
||||
|
||||
const result = await runTest(
|
||||
prompt,
|
||||
'function fibonacci(n) { return n <= 1 ? n : fibonacci(n - 1) + fibonacci(n - 2); }',
|
||||
'fibonacci_function',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
|
||||
testResults.push(result)
|
||||
|
||||
// Save the code to a file
|
||||
const codePath = path.resolve(TEST_CODE_DIR, 'fibonacci.js')
|
||||
write(codePath, result.result[0])
|
||||
|
||||
// Execute the code
|
||||
const context = {}
|
||||
const fibonacciFunction = executeCode(result.result[0], context)
|
||||
expect(fibonacciFunction(6)).toBe(8)
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
generateTestReport(testResults, 'Coding Capabilities Test Results', TEST_REPORT_PATH)
|
||||
expect(exists(TEST_REPORT_PATH) === 'file').toBe(true)
|
||||
})
|
||||
})
|
||||
@ -17,7 +17,8 @@ export const getFastModels = (): string[] => {
|
||||
return [
|
||||
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
|
||||
E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE,
|
||||
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI
|
||||
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
|
||||
E_OPENROUTER_MODEL.MODEL_OPENROUTER_QUASAR_ALPHA
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
@ -7176,6 +7176,457 @@
|
||||
"passed": true,
|
||||
"duration": 954,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:36:55.754Z",
|
||||
"passed": true,
|
||||
"duration": 1505,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:36:59.232Z",
|
||||
"passed": true,
|
||||
"duration": 3470,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:37:00.080Z",
|
||||
"passed": true,
|
||||
"duration": 842,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:37:00.897Z",
|
||||
"passed": true,
|
||||
"duration": 811,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:37:01.784Z",
|
||||
"passed": true,
|
||||
"duration": 881,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:37:03.117Z",
|
||||
"passed": false,
|
||||
"duration": 1327,
|
||||
"reason": "Model returned empty response",
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:37:04.222Z",
|
||||
"passed": true,
|
||||
"duration": 1096,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:37:05.008Z",
|
||||
"passed": true,
|
||||
"duration": 780,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:37:05.799Z",
|
||||
"passed": true,
|
||||
"duration": 784,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:37:10.272Z",
|
||||
"passed": true,
|
||||
"duration": 4467,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:37:11.255Z",
|
||||
"passed": true,
|
||||
"duration": 975,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:37:11.993Z",
|
||||
"passed": true,
|
||||
"duration": 731,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2,-3"
|
||||
],
|
||||
"expected": "-2,-3",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:38:12.580Z",
|
||||
"passed": true,
|
||||
"duration": 1229,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"The solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nAnswer: -3, -2"
|
||||
],
|
||||
"expected": "-2,-3",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:38:24.221Z",
|
||||
"passed": false,
|
||||
"duration": 11633,
|
||||
"reason": "Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nanswer: -3, -2",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2, -3"
|
||||
],
|
||||
"expected": "-2,-3",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:38:25.175Z",
|
||||
"passed": false,
|
||||
"duration": 943,
|
||||
"reason": "Expected -2,-3, but got -2, -3",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2,-3"
|
||||
],
|
||||
"expected": "-2,-3",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:38:26.290Z",
|
||||
"passed": true,
|
||||
"duration": 1105,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:38:27.138Z",
|
||||
"passed": true,
|
||||
"duration": 838,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:38:34.971Z",
|
||||
"passed": true,
|
||||
"duration": 7825,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:38:35.899Z",
|
||||
"passed": true,
|
||||
"duration": 920,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:38:36.748Z",
|
||||
"passed": true,
|
||||
"duration": 840,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:38:37.951Z",
|
||||
"passed": true,
|
||||
"duration": 1195,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:38:49.318Z",
|
||||
"passed": false,
|
||||
"duration": 11358,
|
||||
"reason": "Expected 8, but got 5",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:38:50.264Z",
|
||||
"passed": false,
|
||||
"duration": 935,
|
||||
"reason": "Expected 8, but got 5",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:38:50.973Z",
|
||||
"passed": true,
|
||||
"duration": 701,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:38:51.774Z",
|
||||
"passed": true,
|
||||
"duration": 793,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:39:08.114Z",
|
||||
"passed": true,
|
||||
"duration": 16332,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:39:09.133Z",
|
||||
"passed": true,
|
||||
"duration": 1012,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:39:10.677Z",
|
||||
"passed": true,
|
||||
"duration": 1535,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:39:11.607Z",
|
||||
"passed": true,
|
||||
"duration": 922,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:39:18.707Z",
|
||||
"passed": true,
|
||||
"duration": 7091,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:39:19.719Z",
|
||||
"passed": true,
|
||||
"duration": 1004,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:39:21.294Z",
|
||||
"passed": true,
|
||||
"duration": 1567,
|
||||
"category": "math"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
@ -7184,13 +7635,13 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 935,
|
||||
"duration_secs": 0.935
|
||||
"duration": 943,
|
||||
"duration_secs": 0.943
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1435,
|
||||
"duration_secs": 1.435
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 1105,
|
||||
"duration_secs": 1.105
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -7198,14 +7649,14 @@
|
||||
"test": "factorial",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 861,
|
||||
"duration_secs": 0.861
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 838,
|
||||
"duration_secs": 0.838
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1454,
|
||||
"duration_secs": 1.454
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 840,
|
||||
"duration_secs": 0.84
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -7213,29 +7664,29 @@
|
||||
"test": "fibonacci",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 756,
|
||||
"duration_secs": 0.756
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 701,
|
||||
"duration_secs": 0.701
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 792,
|
||||
"duration_secs": 0.792
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 935,
|
||||
"duration_secs": 0.935
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 793,
|
||||
"duration_secs": 0.793
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 819,
|
||||
"duration_secs": 0.819
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 828,
|
||||
"duration_secs": 0.828
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -7244,13 +7695,13 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 795,
|
||||
"duration_secs": 0.795
|
||||
"duration": 922,
|
||||
"duration_secs": 0.922
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 966,
|
||||
"duration_secs": 0.966
|
||||
"duration": 1004,
|
||||
"duration_secs": 1.004
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -7258,14 +7709,14 @@
|
||||
"test": "addition",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 910,
|
||||
"duration_secs": 0.91
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 811,
|
||||
"duration_secs": 0.811
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 1484,
|
||||
"duration_secs": 1.484
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 842,
|
||||
"duration_secs": 0.842
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -7273,14 +7724,14 @@
|
||||
"test": "multiplication",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 955,
|
||||
"duration_secs": 0.955
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 780,
|
||||
"duration_secs": 0.78
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1095,
|
||||
"duration_secs": 1.095
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 881,
|
||||
"duration_secs": 0.881
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -7288,14 +7739,14 @@
|
||||
"test": "division",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 816,
|
||||
"duration_secs": 0.816
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 731,
|
||||
"duration_secs": 0.731
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 917,
|
||||
"duration_secs": 0.917
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 784,
|
||||
"duration_secs": 0.784
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -7360,5 +7811,5 @@
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-03T17:14:41.955Z"
|
||||
"lastUpdated": "2025-04-04T12:39:21.300Z"
|
||||
}
|
||||
@ -1251,6 +1251,173 @@
|
||||
"passed": true,
|
||||
"duration": 954,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:36:55.754Z",
|
||||
"passed": true,
|
||||
"duration": 1505,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:36:59.232Z",
|
||||
"passed": true,
|
||||
"duration": 3470,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:37:00.080Z",
|
||||
"passed": true,
|
||||
"duration": 842,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:37:00.897Z",
|
||||
"passed": true,
|
||||
"duration": 811,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:37:01.784Z",
|
||||
"passed": true,
|
||||
"duration": 881,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:37:03.117Z",
|
||||
"passed": false,
|
||||
"duration": 1327,
|
||||
"reason": "Model returned empty response",
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:37:04.222Z",
|
||||
"passed": true,
|
||||
"duration": 1096,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:37:05.008Z",
|
||||
"passed": true,
|
||||
"duration": 780,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:37:05.799Z",
|
||||
"passed": true,
|
||||
"duration": 784,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:37:10.272Z",
|
||||
"passed": true,
|
||||
"duration": 4467,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:37:11.255Z",
|
||||
"passed": true,
|
||||
"duration": 975,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:37:11.993Z",
|
||||
"passed": true,
|
||||
"duration": 731,
|
||||
"category": "basic"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
@ -1258,14 +1425,14 @@
|
||||
"test": "addition",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 910,
|
||||
"duration_secs": 0.91
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 811,
|
||||
"duration_secs": 0.811
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 1484,
|
||||
"duration_secs": 1.484
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 842,
|
||||
"duration_secs": 0.842
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1273,14 +1440,14 @@
|
||||
"test": "multiplication",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 955,
|
||||
"duration_secs": 0.955
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 780,
|
||||
"duration_secs": 0.78
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1095,
|
||||
"duration_secs": 1.095
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 881,
|
||||
"duration_secs": 0.881
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1288,17 +1455,17 @@
|
||||
"test": "division",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 816,
|
||||
"duration_secs": 0.816
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 731,
|
||||
"duration_secs": 0.731
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 917,
|
||||
"duration_secs": 0.917
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 784,
|
||||
"duration_secs": 0.784
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-03T17:14:41.951Z"
|
||||
"lastUpdated": "2025-04-04T12:37:11.994Z"
|
||||
}
|
||||
@ -6,38 +6,37 @@
|
||||
|
||||
| Test | Model | Duration (ms) | Duration (s) |
|
||||
|------|-------|--------------|--------------|
|
||||
| addition | openai/gpt-4o-mini | 910 | 0.91 |
|
||||
| addition | openai/gpt-3.5-turbo | 1484 | 1.48 |
|
||||
| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 8460 | 8.46 |
|
||||
| multiplication | openai/gpt-3.5-turbo | 955 | 0.95 |
|
||||
| multiplication | openai/gpt-4o-mini | 1095 | 1.09 |
|
||||
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 7653 | 7.65 |
|
||||
| division | openai/gpt-3.5-turbo | 816 | 0.82 |
|
||||
| division | openai/gpt-4o-mini | 954 | 0.95 |
|
||||
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 16655 | 16.66 |
|
||||
| addition | openrouter/quasar-alpha | 811 | 0.81 |
|
||||
| addition | openai/gpt-4o-mini | 842 | 0.84 |
|
||||
| addition | openai/gpt-3.5-turbo | 1505 | 1.50 |
|
||||
| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 3470 | 3.47 |
|
||||
| multiplication | openrouter/quasar-alpha | 780 | 0.78 |
|
||||
| multiplication | openai/gpt-3.5-turbo | 881 | 0.88 |
|
||||
| multiplication | openai/gpt-4o-mini | 1096 | 1.10 |
|
||||
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 1327 | 1.33 |
|
||||
| division | openrouter/quasar-alpha | 731 | 0.73 |
|
||||
| division | openai/gpt-3.5-turbo | 784 | 0.78 |
|
||||
| division | openai/gpt-4o-mini | 975 | 0.97 |
|
||||
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 4467 | 4.47 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 9
|
||||
- Passed: 8
|
||||
- Total Tests: 12
|
||||
- Passed: 11
|
||||
- Failed: 1
|
||||
- Success Rate: 88.89%
|
||||
- Average Duration: 4331ms (4.33s)
|
||||
- Success Rate: 91.67%
|
||||
- Average Duration: 1472ms (1.47s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### division - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `15 divided by 3 is 5.
|
||||
|
||||
Answer: 5`
|
||||
- Duration: 16655ms (16.66s)
|
||||
- Reason: Expected 5, but got 15 divided by 3 is 5.
|
||||
|
||||
answer: 5
|
||||
- Timestamp: 4/3/2025, 7:14:40 PM
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: ``
|
||||
- Duration: 1327ms (1.33s)
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/4/2025, 2:37:03 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
@ -46,62 +45,86 @@ answer: 5
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1484ms (1.48s)
|
||||
- Timestamp: 4/3/2025, 7:14:04 PM
|
||||
- Duration: 1505ms (1.50s)
|
||||
- Timestamp: 4/4/2025, 2:36:55 PM
|
||||
|
||||
### addition - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 8460ms (8.46s)
|
||||
- Timestamp: 4/3/2025, 7:14:12 PM
|
||||
- Duration: 3470ms (3.47s)
|
||||
- Timestamp: 4/4/2025, 2:36:59 PM
|
||||
|
||||
### addition - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 910ms (0.91s)
|
||||
- Timestamp: 4/3/2025, 7:14:13 PM
|
||||
- Duration: 842ms (0.84s)
|
||||
- Timestamp: 4/4/2025, 2:37:00 PM
|
||||
|
||||
### addition - openrouter/quasar-alpha
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 811ms (0.81s)
|
||||
- Timestamp: 4/4/2025, 2:37:00 PM
|
||||
|
||||
### multiplication - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 955ms (0.95s)
|
||||
- Timestamp: 4/3/2025, 7:14:14 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 7653ms (7.65s)
|
||||
- Timestamp: 4/3/2025, 7:14:22 PM
|
||||
- Duration: 881ms (0.88s)
|
||||
- Timestamp: 4/4/2025, 2:37:01 PM
|
||||
|
||||
### multiplication - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 1095ms (1.09s)
|
||||
- Timestamp: 4/3/2025, 7:14:23 PM
|
||||
- Duration: 1096ms (1.10s)
|
||||
- Timestamp: 4/4/2025, 2:37:04 PM
|
||||
|
||||
### multiplication - openrouter/quasar-alpha
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 780ms (0.78s)
|
||||
- Timestamp: 4/4/2025, 2:37:05 PM
|
||||
|
||||
### division - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 816ms (0.82s)
|
||||
- Timestamp: 4/3/2025, 7:14:24 PM
|
||||
- Duration: 784ms (0.78s)
|
||||
- Timestamp: 4/4/2025, 2:37:05 PM
|
||||
|
||||
### division - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 4467ms (4.47s)
|
||||
- Timestamp: 4/4/2025, 2:37:10 PM
|
||||
|
||||
### division - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 954ms (0.95s)
|
||||
- Timestamp: 4/3/2025, 7:14:41 PM
|
||||
- Duration: 975ms (0.97s)
|
||||
- Timestamp: 4/4/2025, 2:37:11 PM
|
||||
|
||||
### division - openrouter/quasar-alpha
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 731ms (0.73s)
|
||||
- Timestamp: 4/4/2025, 2:37:11 PM
|
||||
|
||||
|
||||
@ -2828,6 +2828,290 @@
|
||||
"passed": true,
|
||||
"duration": 966,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2,-3"
|
||||
],
|
||||
"expected": "-2,-3",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:38:12.580Z",
|
||||
"passed": true,
|
||||
"duration": 1229,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"The solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nAnswer: -3, -2"
|
||||
],
|
||||
"expected": "-2,-3",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:38:24.221Z",
|
||||
"passed": false,
|
||||
"duration": 11633,
|
||||
"reason": "Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nanswer: -3, -2",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2, -3"
|
||||
],
|
||||
"expected": "-2,-3",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:38:25.175Z",
|
||||
"passed": false,
|
||||
"duration": 943,
|
||||
"reason": "Expected -2,-3, but got -2, -3",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2,-3"
|
||||
],
|
||||
"expected": "-2,-3",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:38:26.290Z",
|
||||
"passed": true,
|
||||
"duration": 1105,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:38:27.138Z",
|
||||
"passed": true,
|
||||
"duration": 838,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:38:34.971Z",
|
||||
"passed": true,
|
||||
"duration": 7825,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:38:35.899Z",
|
||||
"passed": true,
|
||||
"duration": 920,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:38:36.748Z",
|
||||
"passed": true,
|
||||
"duration": 840,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:38:37.951Z",
|
||||
"passed": true,
|
||||
"duration": 1195,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:38:49.318Z",
|
||||
"passed": false,
|
||||
"duration": 11358,
|
||||
"reason": "Expected 8, but got 5",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:38:50.264Z",
|
||||
"passed": false,
|
||||
"duration": 935,
|
||||
"reason": "Expected 8, but got 5",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:38:50.973Z",
|
||||
"passed": true,
|
||||
"duration": 701,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:38:51.774Z",
|
||||
"passed": true,
|
||||
"duration": 793,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:39:08.114Z",
|
||||
"passed": true,
|
||||
"duration": 16332,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:39:09.133Z",
|
||||
"passed": true,
|
||||
"duration": 1012,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:39:10.677Z",
|
||||
"passed": true,
|
||||
"duration": 1535,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-04T12:39:11.607Z",
|
||||
"passed": true,
|
||||
"duration": 922,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-04T12:39:18.707Z",
|
||||
"passed": true,
|
||||
"duration": 7091,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-04T12:39:19.719Z",
|
||||
"passed": true,
|
||||
"duration": 1004,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"router": "openrouter/quasar-alpha",
|
||||
"timestamp": "2025-04-04T12:39:21.294Z",
|
||||
"passed": true,
|
||||
"duration": 1567,
|
||||
"category": "math"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
@ -2836,13 +3120,13 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 935,
|
||||
"duration_secs": 0.935
|
||||
"duration": 943,
|
||||
"duration_secs": 0.943
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1435,
|
||||
"duration_secs": 1.435
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 1105,
|
||||
"duration_secs": 1.105
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -2850,14 +3134,14 @@
|
||||
"test": "factorial",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 861,
|
||||
"duration_secs": 0.861
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 838,
|
||||
"duration_secs": 0.838
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1454,
|
||||
"duration_secs": 1.454
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 840,
|
||||
"duration_secs": 0.84
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -2865,29 +3149,29 @@
|
||||
"test": "fibonacci",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 756,
|
||||
"duration_secs": 0.756
|
||||
"model": "openrouter/quasar-alpha",
|
||||
"duration": 701,
|
||||
"duration_secs": 0.701
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 792,
|
||||
"duration_secs": 0.792
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 935,
|
||||
"duration_secs": 0.935
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 793,
|
||||
"duration_secs": 0.793
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 819,
|
||||
"duration_secs": 0.819
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 828,
|
||||
"duration_secs": 0.828
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -2896,16 +3180,16 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 795,
|
||||
"duration_secs": 0.795
|
||||
"duration": 922,
|
||||
"duration_secs": 0.922
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 966,
|
||||
"duration_secs": 0.966
|
||||
"duration": 1004,
|
||||
"duration_secs": 1.004
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-03T17:10:26.897Z"
|
||||
"lastUpdated": "2025-04-04T12:39:21.296Z"
|
||||
}
|
||||
@ -6,191 +6,204 @@
|
||||
|
||||
| Test | Model | Duration (ms) | Duration (s) |
|
||||
|------|-------|--------------|--------------|
|
||||
| quadratic | openai/gpt-4o-mini | 935 | 0.94 |
|
||||
| quadratic | openai/gpt-3.5-turbo | 1685 | 1.69 |
|
||||
| quadratic | deepseek/deepseek-r1-distill-qwen-14b:free | 10827 | 10.83 |
|
||||
| factorial | openai/gpt-4o-mini | 861 | 0.86 |
|
||||
| factorial | openai/gpt-3.5-turbo | 3991 | 3.99 |
|
||||
| factorial | deepseek/deepseek-r1-distill-qwen-14b:free | 9116 | 9.12 |
|
||||
| fibonacci | openai/gpt-4o-mini | 756 | 0.76 |
|
||||
| fibonacci | openai/gpt-3.5-turbo | 792 | 0.79 |
|
||||
| fibonacci | deepseek/deepseek-r1-distill-qwen-14b:free | 8292 | 8.29 |
|
||||
| square_root | openai/gpt-4o-mini | 828 | 0.83 |
|
||||
| square_root | openai/gpt-3.5-turbo | 892 | 0.89 |
|
||||
| square_root | deepseek/deepseek-r1-distill-qwen-14b:free | 1755 | 1.75 |
|
||||
| power | openai/gpt-3.5-turbo | 795 | 0.80 |
|
||||
| power | openai/gpt-4o-mini | 966 | 0.97 |
|
||||
| power | deepseek/deepseek-r1-distill-qwen-14b:free | 7263 | 7.26 |
|
||||
| quadratic | openai/gpt-4o-mini | 943 | 0.94 |
|
||||
| quadratic | openrouter/quasar-alpha | 1105 | 1.10 |
|
||||
| quadratic | openai/gpt-3.5-turbo | 1229 | 1.23 |
|
||||
| quadratic | deepseek/deepseek-r1-distill-qwen-14b:free | 11633 | 11.63 |
|
||||
| factorial | openai/gpt-3.5-turbo | 838 | 0.84 |
|
||||
| factorial | openrouter/quasar-alpha | 840 | 0.84 |
|
||||
| factorial | openai/gpt-4o-mini | 920 | 0.92 |
|
||||
| factorial | deepseek/deepseek-r1-distill-qwen-14b:free | 7825 | 7.83 |
|
||||
| fibonacci | openrouter/quasar-alpha | 701 | 0.70 |
|
||||
| fibonacci | openai/gpt-4o-mini | 935 | 0.94 |
|
||||
| fibonacci | openai/gpt-3.5-turbo | 1195 | 1.20 |
|
||||
| fibonacci | deepseek/deepseek-r1-distill-qwen-14b:free | 11358 | 11.36 |
|
||||
| square_root | openai/gpt-3.5-turbo | 793 | 0.79 |
|
||||
| square_root | openai/gpt-4o-mini | 1012 | 1.01 |
|
||||
| square_root | openrouter/quasar-alpha | 1535 | 1.53 |
|
||||
| square_root | deepseek/deepseek-r1-distill-qwen-14b:free | 16332 | 16.33 |
|
||||
| power | openai/gpt-3.5-turbo | 922 | 0.92 |
|
||||
| power | openai/gpt-4o-mini | 1004 | 1.00 |
|
||||
| power | openrouter/quasar-alpha | 1567 | 1.57 |
|
||||
| power | deepseek/deepseek-r1-distill-qwen-14b:free | 7091 | 7.09 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 15
|
||||
- Passed: 9
|
||||
- Failed: 6
|
||||
- Success Rate: 60.00%
|
||||
- Average Duration: 3317ms (3.32s)
|
||||
- Total Tests: 20
|
||||
- Passed: 16
|
||||
- Failed: 4
|
||||
- Success Rate: 80.00%
|
||||
- Average Duration: 3489ms (3.49s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### quadratic - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-2,-3`
|
||||
- Actual: `-3, -2`
|
||||
- Duration: 1685ms (1.69s)
|
||||
- Reason: Expected -2,-3, but got -3, -2
|
||||
- Timestamp: 4/3/2025, 7:09:38 PM
|
||||
|
||||
### quadratic - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-2,-3`
|
||||
- Actual: `The solutions to the quadratic equation x² + 5x + 6 = 0 are -2 and -3.
|
||||
- Actual: `The solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.
|
||||
|
||||
Specific steps:
|
||||
Answer: -3, -2`
|
||||
- Duration: 11633ms (11.63s)
|
||||
- Reason: Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.
|
||||
|
||||
1. **Identify coefficients**: a = 1, b = 5, c = 6.
|
||||
2. **Calculate discriminant**: b² - 4ac = 25 - 24 = 1.
|
||||
3. **Apply quadratic formula**: x = (-5 ± √1)/2.
|
||||
4. **Compute solutions**: x = (-5 + 1)/2 = -2 and x = (-5 - 1)/2 = -3.
|
||||
|
||||
-2, -3
|
||||
|
||||
Answer: -2,-3`
|
||||
- Duration: 10827ms (10.83s)
|
||||
- Reason: Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -2 and -3.
|
||||
|
||||
specific steps:
|
||||
|
||||
1. **identify coefficients**: a = 1, b = 5, c = 6.
|
||||
2. **calculate discriminant**: b² - 4ac = 25 - 24 = 1.
|
||||
3. **apply quadratic formula**: x = (-5 ± √1)/2.
|
||||
4. **compute solutions**: x = (-5 + 1)/2 = -2 and x = (-5 - 1)/2 = -3.
|
||||
|
||||
-2, -3
|
||||
|
||||
answer: -2,-3
|
||||
- Timestamp: 4/3/2025, 7:09:49 PM
|
||||
answer: -3, -2
|
||||
- Timestamp: 4/4/2025, 2:38:24 PM
|
||||
|
||||
### quadratic - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-2,-3`
|
||||
- Actual: `-2, -3`
|
||||
- Duration: 935ms (0.94s)
|
||||
- Duration: 943ms (0.94s)
|
||||
- Reason: Expected -2,-3, but got -2, -3
|
||||
- Timestamp: 4/3/2025, 7:09:50 PM
|
||||
|
||||
### factorial - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `5! = 120
|
||||
|
||||
The factorial of 5 is calculated as:
|
||||
|
||||
5 × 4 × 3 × 2 × 1 = 120
|
||||
|
||||
**Answer:** 120`
|
||||
- Duration: 9116ms (9.12s)
|
||||
- Reason: Expected 120, but got 5! = 120
|
||||
|
||||
the factorial of 5 is calculated as:
|
||||
|
||||
5 × 4 × 3 × 2 × 1 = 120
|
||||
|
||||
**answer:** 120
|
||||
- Timestamp: 4/3/2025, 7:10:03 PM
|
||||
- Timestamp: 4/4/2025, 2:38:25 PM
|
||||
|
||||
### fibonacci - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `5`
|
||||
- Duration: 8292ms (8.29s)
|
||||
- Duration: 11358ms (11.36s)
|
||||
- Reason: Expected 8, but got 5
|
||||
- Timestamp: 4/3/2025, 7:10:13 PM
|
||||
- Timestamp: 4/4/2025, 2:38:49 PM
|
||||
|
||||
### fibonacci - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `5`
|
||||
- Duration: 756ms (0.76s)
|
||||
- Duration: 935ms (0.94s)
|
||||
- Reason: Expected 8, but got 5
|
||||
- Timestamp: 4/3/2025, 7:10:14 PM
|
||||
- Timestamp: 4/4/2025, 2:38:50 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### quadratic - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-2,-3`
|
||||
- Actual: `-2,-3`
|
||||
- Duration: 1229ms (1.23s)
|
||||
- Timestamp: 4/4/2025, 2:38:12 PM
|
||||
|
||||
### quadratic - openrouter/quasar-alpha
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-2,-3`
|
||||
- Actual: `-2,-3`
|
||||
- Duration: 1105ms (1.10s)
|
||||
- Timestamp: 4/4/2025, 2:38:26 PM
|
||||
|
||||
### factorial - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 3991ms (3.99s)
|
||||
- Timestamp: 4/3/2025, 7:09:54 PM
|
||||
- Duration: 838ms (0.84s)
|
||||
- Timestamp: 4/4/2025, 2:38:27 PM
|
||||
|
||||
### factorial - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 7825ms (7.83s)
|
||||
- Timestamp: 4/4/2025, 2:38:34 PM
|
||||
|
||||
### factorial - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 861ms (0.86s)
|
||||
- Timestamp: 4/3/2025, 7:10:04 PM
|
||||
- Duration: 920ms (0.92s)
|
||||
- Timestamp: 4/4/2025, 2:38:35 PM
|
||||
|
||||
### factorial - openrouter/quasar-alpha
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 840ms (0.84s)
|
||||
- Timestamp: 4/4/2025, 2:38:36 PM
|
||||
|
||||
### fibonacci - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 792ms (0.79s)
|
||||
- Timestamp: 4/3/2025, 7:10:05 PM
|
||||
- Duration: 1195ms (1.20s)
|
||||
- Timestamp: 4/4/2025, 2:38:37 PM
|
||||
|
||||
### fibonacci - openrouter/quasar-alpha
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 701ms (0.70s)
|
||||
- Timestamp: 4/4/2025, 2:38:50 PM
|
||||
|
||||
### square_root - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 892ms (0.89s)
|
||||
- Timestamp: 4/3/2025, 7:10:15 PM
|
||||
- Duration: 793ms (0.79s)
|
||||
- Timestamp: 4/4/2025, 2:38:51 PM
|
||||
|
||||
### square_root - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 1755ms (1.75s)
|
||||
- Timestamp: 4/3/2025, 7:10:17 PM
|
||||
- Duration: 16332ms (16.33s)
|
||||
- Timestamp: 4/4/2025, 2:39:08 PM
|
||||
|
||||
### square_root - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 828ms (0.83s)
|
||||
- Timestamp: 4/3/2025, 7:10:17 PM
|
||||
- Duration: 1012ms (1.01s)
|
||||
- Timestamp: 4/4/2025, 2:39:09 PM
|
||||
|
||||
### square_root - openrouter/quasar-alpha
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 1535ms (1.53s)
|
||||
- Timestamp: 4/4/2025, 2:39:10 PM
|
||||
|
||||
### power - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 795ms (0.80s)
|
||||
- Timestamp: 4/3/2025, 7:10:18 PM
|
||||
- Duration: 922ms (0.92s)
|
||||
- Timestamp: 4/4/2025, 2:39:11 PM
|
||||
|
||||
### power - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 7263ms (7.26s)
|
||||
- Timestamp: 4/3/2025, 7:10:25 PM
|
||||
- Duration: 7091ms (7.09s)
|
||||
- Timestamp: 4/4/2025, 2:39:18 PM
|
||||
|
||||
### power - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 966ms (0.97s)
|
||||
- Timestamp: 4/3/2025, 7:10:26 PM
|
||||
- Duration: 1004ms (1.00s)
|
||||
- Timestamp: 4/4/2025, 2:39:19 PM
|
||||
|
||||
### power - openrouter/quasar-alpha
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1567ms (1.57s)
|
||||
- Timestamp: 4/4/2025, 2:39:21 PM
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user