tests: quasar - alpha

This commit is contained in:
lovebird 2025-04-04 14:44:04 +02:00
parent 214eed04f5
commit ddf8dbce1f
10 changed files with 1393 additions and 238 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,9 +1,9 @@
{
"model": "openai/gpt-4o-mini",
"model": "openrouter/quasar-alpha",
"messages": [
{
"role": "user",
"content": "divide 15 by 3. Return only the number, no explanation."
"content": "Calculate 2 raised to the power of 3. Return only the number, no explanation."
},
{
"role": "user",

View File

@ -23,6 +23,7 @@
"test:seo": "vitest run tests/unit/seo.test.ts",
"test:language": "vitest run tests/unit/language.test.ts",
"test:tools": "vitest run tests/unit/tools.test.ts",
"test:coding": "vitest run tests/unit/coding.test.ts",
"test2:watch": "vitest",
"test2:coverage": "vitest run --coverage",
"webpack": "webpack --config webpack.config.js --stats-error-details",
@ -49,6 +50,7 @@
"p-map": "7.0.3",
"ts-retry": "6.0.0",
"tslog": "^4.9.3",
"vm2": "^3.9.19",
"yargs": "17.7.2",
"zod": "3.24.2"
},

View File

@ -0,0 +1,122 @@
import { describe, it, expect } from 'vitest'
import * as path from 'node:path'
import { sync as exists } from "@polymech/fs/exists"
import { sync as read } from "@polymech/fs/read"
import { sync as write } from "@polymech/fs/write"
import { sync as mkdirp } from "mkdirp"
import { VM } from 'vm2'
import {
getDefaultModels,
TEST_BASE_PATH,
TEST_LOGS_PATH,
TEST_PREFERENCES_PATH,
TEST_TIMEOUT,
TestResult,
runTest,
generateTestReport,
getReportPaths
} from './commons'
// Models exercised by this suite; replace this list to override the defaults for this file only
const models = getDefaultModels()
// Directory that receives the generated code samples; created up front when it is not already a directory
const TEST_CODE_DIR = path.resolve(__dirname, '../test-data/code')
const codeDirState = exists(TEST_CODE_DIR)
if (codeDirState !== 'directory') {
  mkdirp(TEST_CODE_DIR)
}
/**
 * Coding capability suite: asks each model for a small JavaScript function,
 * stores the returned source under TEST_CODE_DIR, evaluates it in a vm2
 * sandbox and checks the function's result for a known input.
 * The final test renders every collected result into a markdown report.
 */
describe('Coding Capabilities', () => {
  // One entry per model/test pair; consumed by the report test at the end of the suite.
  const testResults: TestResult[] = []
  const TEST_LOG_PATH = getReportPaths('coding', 'json')
  const TEST_REPORT_PATH = getReportPaths('coding', 'md')

  /** Description of one "generate a function and call it" scenario. */
  type CodingCase = {
    /** Prompt sent to the model. */
    prompt: string
    /** Reference implementation handed to runTest as the expected answer. */
    expected: string
    /** Test identifier handed to runTest. */
    testName: string
    /** Name the generated function must have; also used as the saved file's base name. */
    functionName: string
  }

  /**
   * Returns the content of a single surrounding Markdown code fence
   * (```lang ... ```), or the input unchanged when it is not fenced.
   */
  const stripCodeFence = (source: string): string => {
    const fenced = source.trim().match(/^```[\w-]*\r?\n([\s\S]*?)\r?\n?```$/)
    return fenced?.[1] ?? source
  }

  /**
   * Runs `code` inside a vm2 sandbox and returns the script's completion value.
   *
   * NOTE(review): the code executed here is model output, i.e. untrusted input.
   * vm2 3.9.x has published sandbox-escape advisories, so this sandbox should not
   * be treated as a security boundary — confirm whether a stronger isolation
   * (separate process/container) is required for this test suite.
   *
   * @param code - JavaScript source to evaluate
   * @param context - object exposed to the script as its sandbox globals
   * @returns whatever the last evaluated statement of `code` produces
   */
  const executeCode = (code: string, context: Record<string, unknown> = {}): unknown => {
    const vm = new VM({
      timeout: 1000,
      sandbox: context
    })
    return vm.run(code)
  }

  /**
   * Evaluates generated source and returns the function called `functionName`.
   *
   * A bare function declaration has no completion value, so running the source
   * alone yields `undefined`. Referencing the name as the final expression of the
   * same script makes the sandbox hand the function back (this also covers
   * `const name = (...) => ...` style answers).
   *
   * @param code - source returned by the model, optionally wrapped in a code fence
   * @param functionName - trusted, hard-coded identifier expected to be defined by `code`
   * @throws Error when the evaluated source does not define a function with that name
   */
  const loadFunction = (code: string, functionName: string): ((...args: unknown[]) => unknown) => {
    const loaded = executeCode(`${stripCodeFence(code)}\n;${functionName}`)
    if (typeof loaded !== 'function') {
      throw new Error(`Generated code does not define a function named '${functionName}'`)
    }
    return loaded as (...args: unknown[]) => unknown
  }

  /**
   * Runs one scenario against one model: queries the model, records the result,
   * saves the returned source to `<functionName>.js` and loads the function.
   *
   * @throws Error when the model returned no content at all
   */
  const generateFunction = async (
    scenario: CodingCase,
    modelName: string
  ): Promise<(...args: unknown[]) => unknown> => {
    const result = await runTest(
      scenario.prompt,
      scenario.expected,
      scenario.testName,
      modelName,
      TEST_LOG_PATH
    )
    // Record first so failed runs still show up in the report.
    testResults.push(result)
    const generated = result.result[0]
    if (!generated) {
      throw new Error(`Model ${modelName} returned no code for '${scenario.testName}'`)
    }
    // Save the code to a file (the raw model answer, exactly as received)
    write(path.resolve(TEST_CODE_DIR, `${scenario.functionName}.js`), generated)
    return loadFunction(generated, scenario.functionName)
  }

  it.each(models)('should generate and execute a simple function with model %s', async (modelName) => {
    const addFunction = await generateFunction({
      prompt: `Generate a JavaScript function that adds two numbers and returns the result.
The function should be named 'add' and take two parameters 'a' and 'b'.
Return only the function code, no explanation.`,
      expected: 'function add(a, b) { return a + b; }',
      testName: 'simple_function',
      functionName: 'add'
    }, modelName)
    expect(addFunction(5, 3)).toBe(8)
  }, { timeout: TEST_TIMEOUT })

  it.each(models)('should generate and execute a factorial function with model %s', async (modelName) => {
    const factorialFunction = await generateFunction({
      prompt: `Generate a JavaScript function that calculates the factorial of a number.
The function should be named 'factorial' and take one parameter 'n'.
Return only the function code, no explanation.`,
      expected: 'function factorial(n) { return n <= 1 ? 1 : n * factorial(n - 1); }',
      testName: 'factorial_function',
      functionName: 'factorial'
    }, modelName)
    expect(factorialFunction(5)).toBe(120)
  }, { timeout: TEST_TIMEOUT })

  it.each(models)('should generate and execute a fibonacci function with model %s', async (modelName) => {
    const fibonacciFunction = await generateFunction({
      prompt: `Generate a JavaScript function that calculates the nth Fibonacci number.
The function should be named 'fibonacci' and take one parameter 'n'.
Return only the function code, no explanation.`,
      expected: 'function fibonacci(n) { return n <= 1 ? n : fibonacci(n - 1) + fibonacci(n - 2); }',
      testName: 'fibonacci_function',
      functionName: 'fibonacci'
    }, modelName)
    expect(fibonacciFunction(6)).toBe(8)
  }, { timeout: TEST_TIMEOUT })

  // Relies on running after the tests above so that testResults is populated.
  it('should generate markdown report', () => {
    generateTestReport(testResults, 'Coding Capabilities Test Results', TEST_REPORT_PATH)
    expect(exists(TEST_REPORT_PATH)).toBe('file')
  })
})

View File

@ -17,7 +17,8 @@ export const getFastModels = (): string[] => {
return [
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE,
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
E_OPENROUTER_MODEL.MODEL_OPENROUTER_QUASAR_ALPHA
]
}

View File

@ -7176,6 +7176,457 @@
"passed": true,
"duration": 954,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:36:55.754Z",
"passed": true,
"duration": 1505,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:36:59.232Z",
"passed": true,
"duration": 3470,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:37:00.080Z",
"passed": true,
"duration": 842,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:37:00.897Z",
"passed": true,
"duration": 811,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:37:01.784Z",
"passed": true,
"duration": 881,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [],
"expected": "24",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:37:03.117Z",
"passed": false,
"duration": 1327,
"reason": "Model returned empty response",
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:37:04.222Z",
"passed": true,
"duration": 1096,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:37:05.008Z",
"passed": true,
"duration": 780,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:37:05.799Z",
"passed": true,
"duration": 784,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:37:10.272Z",
"passed": true,
"duration": 4467,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:37:11.255Z",
"passed": true,
"duration": 975,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:37:11.993Z",
"passed": true,
"duration": 731,
"category": "basic"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-2,-3",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:38:12.580Z",
"passed": true,
"duration": 1229,
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"The solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nAnswer: -3, -2"
],
"expected": "-2,-3",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:38:24.221Z",
"passed": false,
"duration": 11633,
"reason": "Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nanswer: -3, -2",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-2,-3",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:38:25.175Z",
"passed": false,
"duration": 943,
"reason": "Expected -2,-3, but got -2, -3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-2,-3",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:38:26.290Z",
"passed": true,
"duration": 1105,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:38:27.138Z",
"passed": true,
"duration": 838,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:38:34.971Z",
"passed": true,
"duration": 7825,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:38:35.899Z",
"passed": true,
"duration": 920,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:38:36.748Z",
"passed": true,
"duration": 840,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:38:37.951Z",
"passed": true,
"duration": 1195,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:38:49.318Z",
"passed": false,
"duration": 11358,
"reason": "Expected 8, but got 5",
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:38:50.264Z",
"passed": false,
"duration": 935,
"reason": "Expected 8, but got 5",
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:38:50.973Z",
"passed": true,
"duration": 701,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:38:51.774Z",
"passed": true,
"duration": 793,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:39:08.114Z",
"passed": true,
"duration": 16332,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:39:09.133Z",
"passed": true,
"duration": 1012,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:39:10.677Z",
"passed": true,
"duration": 1535,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:39:11.607Z",
"passed": true,
"duration": 922,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:39:18.707Z",
"passed": true,
"duration": 7091,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:39:19.719Z",
"passed": true,
"duration": 1004,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:39:21.294Z",
"passed": true,
"duration": 1567,
"category": "math"
}
],
"highscores": [
@ -7184,13 +7635,13 @@
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 935,
"duration_secs": 0.935
"duration": 943,
"duration_secs": 0.943
},
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1435,
"duration_secs": 1.435
"model": "openrouter/quasar-alpha",
"duration": 1105,
"duration_secs": 1.105
}
]
},
@ -7198,14 +7649,14 @@
"test": "factorial",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 861,
"duration_secs": 0.861
"model": "openai/gpt-3.5-turbo",
"duration": 838,
"duration_secs": 0.838
},
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1454,
"duration_secs": 1.454
"model": "openrouter/quasar-alpha",
"duration": 840,
"duration_secs": 0.84
}
]
},
@ -7213,29 +7664,29 @@
"test": "fibonacci",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 756,
"duration_secs": 0.756
"model": "openrouter/quasar-alpha",
"duration": 701,
"duration_secs": 0.701
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 792,
"duration_secs": 0.792
"model": "openai/gpt-4o-mini",
"duration": 935,
"duration_secs": 0.935
}
]
},
{
"test": "square_root",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 793,
"duration_secs": 0.793
},
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 819,
"duration_secs": 0.819
},
{
"model": "openai/gpt-4o-mini",
"duration": 828,
"duration_secs": 0.828
}
]
},
@ -7244,13 +7695,13 @@
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 795,
"duration_secs": 0.795
"duration": 922,
"duration_secs": 0.922
},
{
"model": "openai/gpt-4o-mini",
"duration": 966,
"duration_secs": 0.966
"duration": 1004,
"duration_secs": 1.004
}
]
},
@ -7258,14 +7709,14 @@
"test": "addition",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 910,
"duration_secs": 0.91
"model": "openrouter/quasar-alpha",
"duration": 811,
"duration_secs": 0.811
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 1484,
"duration_secs": 1.484
"model": "openai/gpt-4o-mini",
"duration": 842,
"duration_secs": 0.842
}
]
},
@ -7273,14 +7724,14 @@
"test": "multiplication",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 955,
"duration_secs": 0.955
"model": "openrouter/quasar-alpha",
"duration": 780,
"duration_secs": 0.78
},
{
"model": "openai/gpt-4o-mini",
"duration": 1095,
"duration_secs": 1.095
"model": "openai/gpt-3.5-turbo",
"duration": 881,
"duration_secs": 0.881
}
]
},
@ -7288,14 +7739,14 @@
"test": "division",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 816,
"duration_secs": 0.816
"model": "openrouter/quasar-alpha",
"duration": 731,
"duration_secs": 0.731
},
{
"model": "qwen/qwq-32b",
"duration": 917,
"duration_secs": 0.917
"model": "openai/gpt-3.5-turbo",
"duration": 784,
"duration_secs": 0.784
}
]
},
@ -7360,5 +7811,5 @@
]
}
],
"lastUpdated": "2025-04-03T17:14:41.955Z"
"lastUpdated": "2025-04-04T12:39:21.300Z"
}

View File

@ -1251,6 +1251,173 @@
"passed": true,
"duration": 954,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:36:55.754Z",
"passed": true,
"duration": 1505,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:36:59.232Z",
"passed": true,
"duration": 3470,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:37:00.080Z",
"passed": true,
"duration": 842,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:37:00.897Z",
"passed": true,
"duration": 811,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:37:01.784Z",
"passed": true,
"duration": 881,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [],
"expected": "24",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:37:03.117Z",
"passed": false,
"duration": 1327,
"reason": "Model returned empty response",
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:37:04.222Z",
"passed": true,
"duration": 1096,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:37:05.008Z",
"passed": true,
"duration": 780,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:37:05.799Z",
"passed": true,
"duration": 784,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:37:10.272Z",
"passed": true,
"duration": 4467,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:37:11.255Z",
"passed": true,
"duration": 975,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:37:11.993Z",
"passed": true,
"duration": 731,
"category": "basic"
}
],
"highscores": [
@ -1258,14 +1425,14 @@
"test": "addition",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 910,
"duration_secs": 0.91
"model": "openrouter/quasar-alpha",
"duration": 811,
"duration_secs": 0.811
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 1484,
"duration_secs": 1.484
"model": "openai/gpt-4o-mini",
"duration": 842,
"duration_secs": 0.842
}
]
},
@ -1273,14 +1440,14 @@
"test": "multiplication",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 955,
"duration_secs": 0.955
"model": "openrouter/quasar-alpha",
"duration": 780,
"duration_secs": 0.78
},
{
"model": "openai/gpt-4o-mini",
"duration": 1095,
"duration_secs": 1.095
"model": "openai/gpt-3.5-turbo",
"duration": 881,
"duration_secs": 0.881
}
]
},
@ -1288,17 +1455,17 @@
"test": "division",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 816,
"duration_secs": 0.816
"model": "openrouter/quasar-alpha",
"duration": 731,
"duration_secs": 0.731
},
{
"model": "qwen/qwq-32b",
"duration": 917,
"duration_secs": 0.917
"model": "openai/gpt-3.5-turbo",
"duration": 784,
"duration_secs": 0.784
}
]
}
],
"lastUpdated": "2025-04-03T17:14:41.951Z"
"lastUpdated": "2025-04-04T12:37:11.994Z"
}

View File

@ -6,38 +6,37 @@
| Test | Model | Duration (ms) | Duration (s) |
|------|-------|--------------|--------------|
| addition | openai/gpt-4o-mini | 910 | 0.91 |
| addition | openai/gpt-3.5-turbo | 1484 | 1.48 |
| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 8460 | 8.46 |
| multiplication | openai/gpt-3.5-turbo | 955 | 0.95 |
| multiplication | openai/gpt-4o-mini | 1095 | 1.09 |
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 7653 | 7.65 |
| division | openai/gpt-3.5-turbo | 816 | 0.82 |
| division | openai/gpt-4o-mini | 954 | 0.95 |
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 16655 | 16.66 |
| addition | openrouter/quasar-alpha | 811 | 0.81 |
| addition | openai/gpt-4o-mini | 842 | 0.84 |
| addition | openai/gpt-3.5-turbo | 1505 | 1.50 |
| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 3470 | 3.47 |
| multiplication | openrouter/quasar-alpha | 780 | 0.78 |
| multiplication | openai/gpt-3.5-turbo | 881 | 0.88 |
| multiplication | openai/gpt-4o-mini | 1096 | 1.10 |
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 1327 | 1.33 |
| division | openrouter/quasar-alpha | 731 | 0.73 |
| division | openai/gpt-3.5-turbo | 784 | 0.78 |
| division | openai/gpt-4o-mini | 975 | 0.97 |
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 4467 | 4.47 |
## Summary
- Total Tests: 9
- Passed: 8
- Total Tests: 12
- Passed: 11
- Failed: 1
- Success Rate: 88.89%
- Average Duration: 4331ms (4.33s)
- Success Rate: 91.67%
- Average Duration: 1472ms (1.47s)
## Failed Tests
### division - deepseek/deepseek-r1-distill-qwen-14b:free
### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `15 divided by 3 is 5.
Answer: 5`
- Duration: 16655ms (16.66s)
- Reason: Expected 5, but got 15 divided by 3 is 5.
answer: 5
- Timestamp: 4/3/2025, 7:14:40 PM
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: ``
- Duration: 1327ms (1.33s)
- Reason: Model returned empty response
- Timestamp: 4/4/2025, 2:37:03 PM
## Passed Tests
@ -46,62 +45,86 @@ answer: 5
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1484ms (1.48s)
- Timestamp: 4/3/2025, 7:14:04 PM
- Duration: 1505ms (1.50s)
- Timestamp: 4/4/2025, 2:36:55 PM
### addition - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 8460ms (8.46s)
- Timestamp: 4/3/2025, 7:14:12 PM
- Duration: 3470ms (3.47s)
- Timestamp: 4/4/2025, 2:36:59 PM
### addition - openai/gpt-4o-mini
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 910ms (0.91s)
- Timestamp: 4/3/2025, 7:14:13 PM
- Duration: 842ms (0.84s)
- Timestamp: 4/4/2025, 2:37:00 PM
### addition - openrouter/quasar-alpha
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 811ms (0.81s)
- Timestamp: 4/4/2025, 2:37:00 PM
### multiplication - openai/gpt-3.5-turbo
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 955ms (0.95s)
- Timestamp: 4/3/2025, 7:14:14 PM
### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 7653ms (7.65s)
- Timestamp: 4/3/2025, 7:14:22 PM
- Duration: 881ms (0.88s)
- Timestamp: 4/4/2025, 2:37:01 PM
### multiplication - openai/gpt-4o-mini
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 1095ms (1.09s)
- Timestamp: 4/3/2025, 7:14:23 PM
- Duration: 1096ms (1.10s)
- Timestamp: 4/4/2025, 2:37:04 PM
### multiplication - openrouter/quasar-alpha
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 780ms (0.78s)
- Timestamp: 4/4/2025, 2:37:05 PM
### division - openai/gpt-3.5-turbo
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 816ms (0.82s)
- Timestamp: 4/3/2025, 7:14:24 PM
- Duration: 784ms (0.78s)
- Timestamp: 4/4/2025, 2:37:05 PM
### division - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 4467ms (4.47s)
- Timestamp: 4/4/2025, 2:37:10 PM
### division - openai/gpt-4o-mini
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 954ms (0.95s)
- Timestamp: 4/3/2025, 7:14:41 PM
- Duration: 975ms (0.97s)
- Timestamp: 4/4/2025, 2:37:11 PM
### division - openrouter/quasar-alpha
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 731ms (0.73s)
- Timestamp: 4/4/2025, 2:37:11 PM

View File

@ -2828,6 +2828,290 @@
"passed": true,
"duration": 966,
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-2,-3",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:38:12.580Z",
"passed": true,
"duration": 1229,
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"The solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nAnswer: -3, -2"
],
"expected": "-2,-3",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:38:24.221Z",
"passed": false,
"duration": 11633,
"reason": "Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.\n\nanswer: -3, -2",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-2,-3",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:38:25.175Z",
"passed": false,
"duration": 943,
"reason": "Expected -2,-3, but got -2, -3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-2,-3",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:38:26.290Z",
"passed": true,
"duration": 1105,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:38:27.138Z",
"passed": true,
"duration": 838,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:38:34.971Z",
"passed": true,
"duration": 7825,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:38:35.899Z",
"passed": true,
"duration": 920,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:38:36.748Z",
"passed": true,
"duration": 840,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:38:37.951Z",
"passed": true,
"duration": 1195,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:38:49.318Z",
"passed": false,
"duration": 11358,
"reason": "Expected 8, but got 5",
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:38:50.264Z",
"passed": false,
"duration": 935,
"reason": "Expected 8, but got 5",
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:38:50.973Z",
"passed": true,
"duration": 701,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:38:51.774Z",
"passed": true,
"duration": 793,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:39:08.114Z",
"passed": true,
"duration": 16332,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:39:09.133Z",
"passed": true,
"duration": 1012,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:39:10.677Z",
"passed": true,
"duration": 1535,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-04T12:39:11.607Z",
"passed": true,
"duration": 922,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-04T12:39:18.707Z",
"passed": true,
"duration": 7091,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-04T12:39:19.719Z",
"passed": true,
"duration": 1004,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openrouter/quasar-alpha",
"router": "openrouter/quasar-alpha",
"timestamp": "2025-04-04T12:39:21.294Z",
"passed": true,
"duration": 1567,
"category": "math"
}
],
"highscores": [
@ -2836,13 +3120,13 @@
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 935,
"duration_secs": 0.935
"duration": 943,
"duration_secs": 0.943
},
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1435,
"duration_secs": 1.435
"model": "openrouter/quasar-alpha",
"duration": 1105,
"duration_secs": 1.105
}
]
},
@ -2850,14 +3134,14 @@
"test": "factorial",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 861,
"duration_secs": 0.861
"model": "openai/gpt-3.5-turbo",
"duration": 838,
"duration_secs": 0.838
},
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1454,
"duration_secs": 1.454
"model": "openrouter/quasar-alpha",
"duration": 840,
"duration_secs": 0.84
}
]
},
@ -2865,29 +3149,29 @@
"test": "fibonacci",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 756,
"duration_secs": 0.756
"model": "openrouter/quasar-alpha",
"duration": 701,
"duration_secs": 0.701
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 792,
"duration_secs": 0.792
"model": "openai/gpt-4o-mini",
"duration": 935,
"duration_secs": 0.935
}
]
},
{
"test": "square_root",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 793,
"duration_secs": 0.793
},
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 819,
"duration_secs": 0.819
},
{
"model": "openai/gpt-4o-mini",
"duration": 828,
"duration_secs": 0.828
}
]
},
@ -2896,16 +3180,16 @@
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 795,
"duration_secs": 0.795
"duration": 922,
"duration_secs": 0.922
},
{
"model": "openai/gpt-4o-mini",
"duration": 966,
"duration_secs": 0.966
"duration": 1004,
"duration_secs": 1.004
}
]
}
],
"lastUpdated": "2025-04-03T17:10:26.897Z"
"lastUpdated": "2025-04-04T12:39:21.296Z"
}

View File

@ -6,191 +6,204 @@
| Test | Model | Duration (ms) | Duration (s) |
|------|-------|--------------|--------------|
| quadratic | openai/gpt-4o-mini | 935 | 0.94 |
| quadratic | openai/gpt-3.5-turbo | 1685 | 1.69 |
| quadratic | deepseek/deepseek-r1-distill-qwen-14b:free | 10827 | 10.83 |
| factorial | openai/gpt-4o-mini | 861 | 0.86 |
| factorial | openai/gpt-3.5-turbo | 3991 | 3.99 |
| factorial | deepseek/deepseek-r1-distill-qwen-14b:free | 9116 | 9.12 |
| fibonacci | openai/gpt-4o-mini | 756 | 0.76 |
| fibonacci | openai/gpt-3.5-turbo | 792 | 0.79 |
| fibonacci | deepseek/deepseek-r1-distill-qwen-14b:free | 8292 | 8.29 |
| square_root | openai/gpt-4o-mini | 828 | 0.83 |
| square_root | openai/gpt-3.5-turbo | 892 | 0.89 |
| square_root | deepseek/deepseek-r1-distill-qwen-14b:free | 1755 | 1.75 |
| power | openai/gpt-3.5-turbo | 795 | 0.80 |
| power | openai/gpt-4o-mini | 966 | 0.97 |
| power | deepseek/deepseek-r1-distill-qwen-14b:free | 7263 | 7.26 |
| quadratic | openai/gpt-4o-mini | 943 | 0.94 |
| quadratic | openrouter/quasar-alpha | 1105 | 1.10 |
| quadratic | openai/gpt-3.5-turbo | 1229 | 1.23 |
| quadratic | deepseek/deepseek-r1-distill-qwen-14b:free | 11633 | 11.63 |
| factorial | openai/gpt-3.5-turbo | 838 | 0.84 |
| factorial | openrouter/quasar-alpha | 840 | 0.84 |
| factorial | openai/gpt-4o-mini | 920 | 0.92 |
| factorial | deepseek/deepseek-r1-distill-qwen-14b:free | 7825 | 7.83 |
| fibonacci | openrouter/quasar-alpha | 701 | 0.70 |
| fibonacci | openai/gpt-4o-mini | 935 | 0.94 |
| fibonacci | openai/gpt-3.5-turbo | 1195 | 1.20 |
| fibonacci | deepseek/deepseek-r1-distill-qwen-14b:free | 11358 | 11.36 |
| square_root | openai/gpt-3.5-turbo | 793 | 0.79 |
| square_root | openai/gpt-4o-mini | 1012 | 1.01 |
| square_root | openrouter/quasar-alpha | 1535 | 1.53 |
| square_root | deepseek/deepseek-r1-distill-qwen-14b:free | 16332 | 16.33 |
| power | openai/gpt-3.5-turbo | 922 | 0.92 |
| power | openai/gpt-4o-mini | 1004 | 1.00 |
| power | openrouter/quasar-alpha | 1567 | 1.57 |
| power | deepseek/deepseek-r1-distill-qwen-14b:free | 7091 | 7.09 |
## Summary
- Total Tests: 15
- Passed: 9
- Failed: 6
- Success Rate: 60.00%
- Average Duration: 3317ms (3.32s)
- Total Tests: 20
- Passed: 16
- Failed: 4
- Success Rate: 80.00%
- Average Duration: 3489ms (3.49s)
## Failed Tests
### quadratic - openai/gpt-3.5-turbo
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-2,-3`
- Actual: `-3, -2`
- Duration: 1685ms (1.69s)
- Reason: Expected -2,-3, but got -3, -2
- Timestamp: 4/3/2025, 7:09:38 PM
### quadratic - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-2,-3`
- Actual: `The solutions to the quadratic equation x² + 5x + 6 = 0 are -2 and -3.
- Actual: `The solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.
Specific steps:
Answer: -3, -2`
- Duration: 11633ms (11.63s)
- Reason: Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -3, -2.
1. **Identify coefficients**: a = 1, b = 5, c = 6.
2. **Calculate discriminant**: b² - 4ac = 25 - 24 = 1.
3. **Apply quadratic formula**: x = (-5 ± √1)/2.
4. **Compute solutions**: x = (-5 + 1)/2 = -2 and x = (-5 - 1)/2 = -3.
-2, -3
Answer: -2,-3`
- Duration: 10827ms (10.83s)
- Reason: Expected -2,-3, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are -2 and -3.
specific steps:
1. **identify coefficients**: a = 1, b = 5, c = 6.
2. **calculate discriminant**: b² - 4ac = 25 - 24 = 1.
3. **apply quadratic formula**: x = (-5 ± √1)/2.
4. **compute solutions**: x = (-5 + 1)/2 = -2 and x = (-5 - 1)/2 = -3.
-2, -3
answer: -2,-3
- Timestamp: 4/3/2025, 7:09:49 PM
answer: -3, -2
- Timestamp: 4/4/2025, 2:38:24 PM
### quadratic - openai/gpt-4o-mini
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-2,-3`
- Actual: `-2, -3`
- Duration: 935ms (0.94s)
- Duration: 943ms (0.94s)
- Reason: Expected -2,-3, but got -2, -3
- Timestamp: 4/3/2025, 7:09:50 PM
### factorial - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `5! = 120
The factorial of 5 is calculated as:
5 × 4 × 3 × 2 × 1 = 120
**Answer:** 120`
- Duration: 9116ms (9.12s)
- Reason: Expected 120, but got 5! = 120
the factorial of 5 is calculated as:
5 × 4 × 3 × 2 × 1 = 120
**answer:** 120
- Timestamp: 4/3/2025, 7:10:03 PM
- Timestamp: 4/4/2025, 2:38:25 PM
### fibonacci - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `5`
- Duration: 8292ms (8.29s)
- Duration: 11358ms (11.36s)
- Reason: Expected 8, but got 5
- Timestamp: 4/3/2025, 7:10:13 PM
- Timestamp: 4/4/2025, 2:38:49 PM
### fibonacci - openai/gpt-4o-mini
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `5`
- Duration: 756ms (0.76s)
- Duration: 935ms (0.94s)
- Reason: Expected 8, but got 5
- Timestamp: 4/3/2025, 7:10:14 PM
- Timestamp: 4/4/2025, 2:38:50 PM
## Passed Tests
### quadratic - openai/gpt-3.5-turbo
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-2,-3`
- Actual: `-2,-3`
- Duration: 1229ms (1.23s)
- Timestamp: 4/4/2025, 2:38:12 PM
### quadratic - openrouter/quasar-alpha
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-2,-3`
- Actual: `-2,-3`
- Duration: 1105ms (1.10s)
- Timestamp: 4/4/2025, 2:38:26 PM
### factorial - openai/gpt-3.5-turbo
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 3991ms (3.99s)
- Timestamp: 4/3/2025, 7:09:54 PM
- Duration: 838ms (0.84s)
- Timestamp: 4/4/2025, 2:38:27 PM
### factorial - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 7825ms (7.83s)
- Timestamp: 4/4/2025, 2:38:34 PM
### factorial - openai/gpt-4o-mini
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 861ms (0.86s)
- Timestamp: 4/3/2025, 7:10:04 PM
- Duration: 920ms (0.92s)
- Timestamp: 4/4/2025, 2:38:35 PM
### factorial - openrouter/quasar-alpha
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 840ms (0.84s)
- Timestamp: 4/4/2025, 2:38:36 PM
### fibonacci - openai/gpt-3.5-turbo
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 792ms (0.79s)
- Timestamp: 4/3/2025, 7:10:05 PM
- Duration: 1195ms (1.20s)
- Timestamp: 4/4/2025, 2:38:37 PM
### fibonacci - openrouter/quasar-alpha
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 701ms (0.70s)
- Timestamp: 4/4/2025, 2:38:50 PM
### square_root - openai/gpt-3.5-turbo
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 892ms (0.89s)
- Timestamp: 4/3/2025, 7:10:15 PM
- Duration: 793ms (0.79s)
- Timestamp: 4/4/2025, 2:38:51 PM
### square_root - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 1755ms (1.75s)
- Timestamp: 4/3/2025, 7:10:17 PM
- Duration: 16332ms (16.33s)
- Timestamp: 4/4/2025, 2:39:08 PM
### square_root - openai/gpt-4o-mini
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 828ms (0.83s)
- Timestamp: 4/3/2025, 7:10:17 PM
- Duration: 1012ms (1.01s)
- Timestamp: 4/4/2025, 2:39:09 PM
### square_root - openrouter/quasar-alpha
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 1535ms (1.53s)
- Timestamp: 4/4/2025, 2:39:10 PM
### power - openai/gpt-3.5-turbo
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 795ms (0.80s)
- Timestamp: 4/3/2025, 7:10:18 PM
- Duration: 922ms (0.92s)
- Timestamp: 4/4/2025, 2:39:11 PM
### power - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 7263ms (7.26s)
- Timestamp: 4/3/2025, 7:10:25 PM
- Duration: 7091ms (7.09s)
- Timestamp: 4/4/2025, 2:39:18 PM
### power - openai/gpt-4o-mini
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 966ms (0.97s)
- Timestamp: 4/3/2025, 7:10:26 PM
- Duration: 1004ms (1.00s)
- Timestamp: 4/4/2025, 2:39:19 PM
### power - openrouter/quasar-alpha
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1567ms (1.57s)
- Timestamp: 4/4/2025, 2:39:21 PM