test report commons

This commit is contained in:
lovebird 2025-04-02 13:27:47 +02:00
parent 1c3a2d981e
commit bc644f8635
8 changed files with 2921 additions and 142 deletions

View File

@ -1,5 +1,5 @@
{
"model": "qwen/qwq-32b",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"messages": [
{
"role": "user",

View File

@ -9,7 +9,11 @@ import { sync as mkdirp } from "mkdirp"
export const getDefaultModels = () => [
//E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B
E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B,
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1,
E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE
]
export const isOpenRouterModel = (model: string): boolean => {
@ -51,6 +55,7 @@ export interface TestResult {
details?: any;
};
duration?: number
category?: string;
}
export interface TestHighscore {
@ -108,7 +113,7 @@ export const runTest = async (
logPath: string
): Promise<TestResult> => {
let model = 'unknown'
let router = 'unknown'
let router = 'openrouter'
let startTime = Date.now()
let error: TestResult['error'] | undefined
let testResult: TestResult | undefined
@ -124,7 +129,7 @@ export const runTest = async (
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = isOpenRouterModel(model) ? 'openrouter' : 'unknown'
router = options.model as string
return options
}
}),
@ -181,6 +186,13 @@ export const runTest = async (
throw e
} finally {
if (testResult) {
// Extract category from logPath (e.g., 'reports/basic.json' -> 'basic')
const category = path.basename(logPath, path.extname(logPath))
// Add category to test result
testResult.category = category
// Update category-specific log
const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath) as string) : { results: [], highscores: [] }
const updatedResults = [...(existingData.results || []), testResult]
@ -200,12 +212,40 @@ export const runTest = async (
// Generate highscores
const highscores = generateHighscores(latestResults)
// Write both results and highscores
// Write category-specific results
write(logPath, JSON.stringify({
results: updatedResults,
highscores,
lastUpdated: new Date().toISOString()
}, null, 2))
// Update central all.json log
const allLogPath = path.resolve(REPORTS_DIR, 'all.json')
const allExistingData = exists(allLogPath) === 'file' ? JSON.parse(read(allLogPath) as string) : { results: [], highscores: [] }
const allUpdatedResults = [...(allExistingData.results || []), testResult]
// Group all results by test and model
const allLatestResults = new Map<string, Map<string, TestResult>>()
allUpdatedResults.forEach(result => {
if (!allLatestResults.has(result.test)) {
allLatestResults.set(result.test, new Map())
}
const testMap = allLatestResults.get(result.test)!
const existingResult = testMap.get(result.model)
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
testMap.set(result.model, result)
}
})
// Generate highscores for all results
const allHighscores = generateHighscores(allLatestResults)
// Write all results
write(allLogPath, JSON.stringify({
results: allUpdatedResults,
highscores: allHighscores,
lastUpdated: new Date().toISOString()
}, null, 2))
}
}
return testResult
@ -236,25 +276,29 @@ export const generateTestReport = (
// Add highscore section
report += '## Highscores\n\n'
for (const [testName, modelResults] of latestResults) {
report += `### ${testName}\n`
// Convert model results to array and sort by duration
const sortedResults = Array.from(modelResults.entries())
.map(([model, result]) => ({ model, result }))
.sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0))
.slice(0, 2) // Get top 2
// Create a table header
report += '| Test | Model | Duration (ms) | Duration (s) |\n'
report += '|------|-------|--------------|--------------|\n'
// Sort all results by duration
const allResults = Array.from(latestResults.entries())
.flatMap(([testName, modelResults]) =>
Array.from(modelResults.entries())
.map(([model, result]) => ({
test: testName,
model,
duration: result.duration || 0
}))
)
.sort((a, b) => a.duration - b.duration)
if (sortedResults.length > 0) {
sortedResults.forEach(({ model, result }, index) => {
const duration = result.duration || 0
report += `${index + 1}. ${model}: ${duration}ms (${(duration / 1000).toFixed(2)}s)\n`
})
} else {
report += '*No results available*\n'
}
report += '\n'
}
// Add all results to the table
allResults.forEach(({ test, model, duration }) => {
report += `| ${test} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
})
report += '\n'
// Add summary section
report += '## Summary\n\n'
@ -275,7 +319,7 @@ export const generateTestReport = (
for (const [model, result] of modelResults) {
if (!result.passed) {
hasFailures = true
report += `### ${testName} - ${model}\n`
report += `### ${testName} - ${model}\n\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
@ -305,7 +349,7 @@ export const generateTestReport = (
for (const [model, result] of modelResults) {
if (result.passed) {
hasPassed = true
report += `### ${testName} - ${model}\n`
report += `### ${testName} - ${model}\n\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`

File diff suppressed because it is too large Load Diff

View File

@ -155,6 +155,594 @@
"timestamp": "2025-04-02T11:01:27.791Z",
"passed": true,
"duration": 7852
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:21.370Z",
"passed": true,
"duration": 1213,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:24.898Z",
"passed": true,
"duration": 3524,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:25.624Z",
"passed": true,
"duration": 724,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:26.630Z",
"passed": true,
"duration": 1005,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:27.812Z",
"passed": true,
"duration": 1178,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:31.317Z",
"passed": true,
"duration": 3503,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:32.288Z",
"passed": true,
"duration": 969,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:33.147Z",
"passed": true,
"duration": 858,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:33.724Z",
"passed": true,
"duration": 576,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:34.841Z",
"passed": true,
"duration": 1115,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:35.673Z",
"passed": true,
"duration": 831,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:16:36.762Z",
"passed": true,
"duration": 1087,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:25.749Z",
"passed": true,
"duration": 1644,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:31.261Z",
"passed": true,
"duration": 5507,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:32.131Z",
"passed": true,
"duration": 869,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:33.306Z",
"passed": true,
"duration": 1173,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:34.323Z",
"passed": true,
"duration": 1016,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:38.976Z",
"passed": true,
"duration": 4651,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:39.914Z",
"passed": true,
"duration": 937,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:41.053Z",
"passed": true,
"duration": 1137,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:42.918Z",
"passed": true,
"duration": 1863,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:47.234Z",
"passed": true,
"duration": 4314,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:47.966Z",
"passed": true,
"duration": 730,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:20:48.941Z",
"passed": true,
"duration": 973,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:25:15.745Z",
"passed": true,
"duration": 1951,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:25:19.476Z",
"passed": true,
"duration": 3726,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:25:20.854Z",
"passed": true,
"duration": 1376,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:25:28.044Z",
"passed": true,
"duration": 7188,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1",
"router": "openrouter",
"timestamp": "2025-04-02T11:25:43.203Z",
"passed": true,
"duration": 15157,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "openrouter",
"timestamp": "2025-04-02T11:25:50.736Z",
"passed": true,
"duration": 7531,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:25:51.834Z",
"passed": true,
"duration": 1096,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:25:55.428Z",
"passed": true,
"duration": 3592,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:25:56.874Z",
"passed": true,
"duration": 1444,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:25:57.746Z",
"passed": true,
"duration": 870,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "deepseek/deepseek-r1",
"router": "openrouter",
"timestamp": "2025-04-02T11:26:08.731Z",
"passed": true,
"duration": 10983,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "openrouter",
"timestamp": "2025-04-02T11:26:14.379Z",
"passed": true,
"duration": 5646,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:26:15.658Z",
"passed": true,
"duration": 1276,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:26:21.428Z",
"passed": true,
"duration": 5768,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:26:22.358Z",
"passed": true,
"duration": 929,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:26:23.155Z",
"passed": true,
"duration": 794,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "deepseek/deepseek-r1",
"router": "openrouter",
"timestamp": "2025-04-02T11:26:38.566Z",
"passed": true,
"duration": 15409,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "openrouter",
"timestamp": "2025-04-02T11:26:40.358Z",
"passed": true,
"duration": 1790,
"category": "basic"
}
],
"highscores": [
@ -162,14 +750,14 @@
"test": "addition",
"rankings": [
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1888,
"duration_secs": 1.888
"model": "openai/gpt-4o-mini",
"duration": 1376,
"duration_secs": 1.376
},
{
"model": "qwen/qwq-32b",
"duration": 6304,
"duration_secs": 6.304
"model": "anthropic/claude-3.5-sonnet",
"duration": 1951,
"duration_secs": 1.951
}
]
},
@ -177,14 +765,14 @@
"test": "multiplication",
"rankings": [
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1291,
"duration_secs": 1.291
"model": "openai/gpt-3.5-turbo",
"duration": 870,
"duration_secs": 0.87
},
{
"model": "qwen/qwq-32b",
"duration": 2225,
"duration_secs": 2.225
"model": "anthropic/claude-3.5-sonnet",
"duration": 1096,
"duration_secs": 1.096
}
]
},
@ -192,17 +780,17 @@
"test": "division",
"rankings": [
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1209,
"duration_secs": 1.209
"model": "openai/gpt-3.5-turbo",
"duration": 794,
"duration_secs": 0.794
},
{
"model": "qwen/qwq-32b",
"duration": 7852,
"duration_secs": 7.852
"model": "openai/gpt-4o-mini",
"duration": 929,
"duration_secs": 0.929
}
]
}
],
"lastUpdated": "2025-04-02T11:01:27.792Z"
"lastUpdated": "2025-04-02T11:26:40.358Z"
}

View File

@ -1,72 +0,0 @@
# Basic Operations Test Results
## Highscores
### addition
1. anthropic/claude-3.5-sonnet: 1237ms (1.24s)
2. qwen/qwq-32b: 4298ms (4.30s)
### multiplication
1. anthropic/claude-3.5-sonnet: 1411ms (1.41s)
2. qwen/qwq-32b: 3122ms (3.12s)
### division
1. anthropic/claude-3.5-sonnet: 583ms (0.58s)
2. qwen/qwq-32b: 3615ms (3.62s)
## Summary
- Total Tests: 6
- Passed: 6
- Failed: 0
- Success Rate: 100.00%
- Average Duration: 2378ms (2.38s)
## Failed Tests
*No failed tests*
## Passed Tests
### addition - anthropic/claude-3.5-sonnet
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1237ms (1.24s)
- Timestamp: 4/2/2025, 12:56:09 PM
### addition - qwen/qwq-32b
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 4298ms (4.30s)
- Timestamp: 4/2/2025, 12:56:13 PM
### multiplication - anthropic/claude-3.5-sonnet
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 1411ms (1.41s)
- Timestamp: 4/2/2025, 12:56:15 PM
### multiplication - qwen/qwq-32b
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 3122ms (3.12s)
- Timestamp: 4/2/2025, 12:56:18 PM
### division - anthropic/claude-3.5-sonnet
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 583ms (0.58s)
- Timestamp: 4/2/2025, 12:56:18 PM
### division - qwen/qwq-32b
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 3615ms (3.62s)
- Timestamp: 4/2/2025, 12:56:22 PM

View File

@ -2,25 +2,34 @@
## Highscores
### addition
1. anthropic/claude-3.5-sonnet: 1888ms (1.89s)
2. qwen/qwq-32b: 6304ms (6.30s)
### multiplication
1. anthropic/claude-3.5-sonnet: 1291ms (1.29s)
2. qwen/qwq-32b: 2225ms (2.23s)
### division
1. anthropic/claude-3.5-sonnet: 1209ms (1.21s)
2. qwen/qwq-32b: 7852ms (7.85s)
| Test | Model | Duration (ms) | Duration (s) |
|------|-------|--------------|--------------|
| division | openai/gpt-3.5-turbo | 794 | 0.79 |
| multiplication | openai/gpt-3.5-turbo | 870 | 0.87 |
| division | openai/gpt-4o-mini | 929 | 0.93 |
| multiplication | anthropic/claude-3.5-sonnet | 1096 | 1.10 |
| division | anthropic/claude-3.5-sonnet | 1276 | 1.28 |
| addition | openai/gpt-4o-mini | 1376 | 1.38 |
| multiplication | openai/gpt-4o-mini | 1444 | 1.44 |
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 1790 | 1.79 |
| addition | anthropic/claude-3.5-sonnet | 1951 | 1.95 |
| multiplication | qwen/qwq-32b | 3592 | 3.59 |
| addition | qwen/qwq-32b | 3726 | 3.73 |
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 5646 | 5.65 |
| division | qwen/qwq-32b | 5768 | 5.77 |
| addition | openai/gpt-3.5-turbo | 7188 | 7.19 |
| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 7531 | 7.53 |
| multiplication | deepseek/deepseek-r1 | 10983 | 10.98 |
| addition | deepseek/deepseek-r1 | 15157 | 15.16 |
| division | deepseek/deepseek-r1 | 15409 | 15.41 |
## Summary
- Total Tests: 6
- Passed: 6
- Total Tests: 18
- Passed: 18
- Failed: 0
- Success Rate: 100.00%
- Average Duration: 3462ms (3.46s)
- Average Duration: 4807ms (4.81s)
## Failed Tests
@ -29,44 +38,146 @@
## Passed Tests
### addition - anthropic/claude-3.5-sonnet
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1888ms (1.89s)
- Timestamp: 4/2/2025, 1:01:08 PM
- Duration: 1951ms (1.95s)
- Timestamp: 4/2/2025, 1:25:15 PM
### addition - qwen/qwq-32b
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 6304ms (6.30s)
- Timestamp: 4/2/2025, 1:01:15 PM
- Duration: 3726ms (3.73s)
- Timestamp: 4/2/2025, 1:25:19 PM
### addition - openai/gpt-4o-mini
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1376ms (1.38s)
- Timestamp: 4/2/2025, 1:25:20 PM
### addition - openai/gpt-3.5-turbo
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 7188ms (7.19s)
- Timestamp: 4/2/2025, 1:25:28 PM
### addition - deepseek/deepseek-r1
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 15157ms (15.16s)
- Timestamp: 4/2/2025, 1:25:43 PM
### addition - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 7531ms (7.53s)
- Timestamp: 4/2/2025, 1:25:50 PM
### multiplication - anthropic/claude-3.5-sonnet
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 1291ms (1.29s)
- Timestamp: 4/2/2025, 1:01:16 PM
- Duration: 1096ms (1.10s)
- Timestamp: 4/2/2025, 1:25:51 PM
### multiplication - qwen/qwq-32b
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 2225ms (2.23s)
- Timestamp: 4/2/2025, 1:01:18 PM
- Duration: 3592ms (3.59s)
- Timestamp: 4/2/2025, 1:25:55 PM
### multiplication - openai/gpt-4o-mini
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 1444ms (1.44s)
- Timestamp: 4/2/2025, 1:25:56 PM
### multiplication - openai/gpt-3.5-turbo
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 870ms (0.87s)
- Timestamp: 4/2/2025, 1:25:57 PM
### multiplication - deepseek/deepseek-r1
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 10983ms (10.98s)
- Timestamp: 4/2/2025, 1:26:08 PM
### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 5646ms (5.65s)
- Timestamp: 4/2/2025, 1:26:14 PM
### division - anthropic/claude-3.5-sonnet
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 1209ms (1.21s)
- Timestamp: 4/2/2025, 1:01:19 PM
- Duration: 1276ms (1.28s)
- Timestamp: 4/2/2025, 1:26:15 PM
### division - qwen/qwq-32b
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 7852ms (7.85s)
- Timestamp: 4/2/2025, 1:01:27 PM
- Duration: 5768ms (5.77s)
- Timestamp: 4/2/2025, 1:26:21 PM
### division - openai/gpt-4o-mini
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 929ms (0.93s)
- Timestamp: 4/2/2025, 1:26:22 PM
### division - openai/gpt-3.5-turbo
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 794ms (0.79s)
- Timestamp: 4/2/2025, 1:26:23 PM
### division - deepseek/deepseek-r1
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 15409ms (15.41s)
- Timestamp: 4/2/2025, 1:26:38 PM
### division - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 1790ms (1.79s)
- Timestamp: 4/2/2025, 1:26:40 PM

View File

@ -0,0 +1,700 @@
{
"results": [
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-3,-2",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:09:36.865Z",
"passed": false,
"duration": 1944,
"reason": "Expected -3,-2, but got -2,-3"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-3,-2"
],
"expected": "-3,-2",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:09:47.481Z",
"passed": true,
"duration": 10608
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:09:49.153Z",
"passed": true,
"duration": 1671
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [],
"expected": "120",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:03.043Z",
"passed": false,
"duration": 13889,
"reason": "Model returned empty response"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:03.988Z",
"passed": true,
"duration": 943
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:05.723Z",
"passed": false,
"duration": 1734,
"reason": "Expected 8, but got 5"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:07.465Z",
"passed": true,
"duration": 1739
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:13.671Z",
"passed": true,
"duration": 6205
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:14.967Z",
"passed": true,
"duration": 1295
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:20.932Z",
"passed": true,
"duration": 5964
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-3,-2",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:13:10.276Z",
"passed": false,
"duration": 1242,
"reason": "Expected -3,-2, but got -2,-3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-3,-2",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:13:31.650Z",
"passed": false,
"duration": 21368,
"reason": "Expected -3,-2, but got -2, -3",
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:13:34.699Z",
"passed": true,
"duration": 3046,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:13:45.957Z",
"passed": true,
"duration": 11256,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:13:47.935Z",
"passed": true,
"duration": 1976,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:14:07.714Z",
"passed": false,
"duration": 19778,
"reason": "Expected 8, but got 5",
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:14:08.883Z",
"passed": true,
"duration": 1167,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:14:12.225Z",
"passed": true,
"duration": 3341,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:14:12.889Z",
"passed": true,
"duration": 663,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:14:32.527Z",
"passed": true,
"duration": 19636,
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-3,-2",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:10.419Z",
"passed": false,
"duration": 1650,
"reason": "Expected -3,-2, but got -2,-3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-3,-2"
],
"expected": "-3,-2",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:20.647Z",
"passed": true,
"duration": 10222,
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-3,-2",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:21.643Z",
"passed": false,
"duration": 994,
"reason": "Expected -3,-2, but got -2, -3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-3,-2",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:22.524Z",
"passed": false,
"duration": 878,
"reason": "Expected -3,-2, but got -2, -3",
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:23.496Z",
"passed": true,
"duration": 970,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:28.452Z",
"passed": true,
"duration": 4954,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:29.325Z",
"passed": true,
"duration": 872,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:30.109Z",
"passed": true,
"duration": 782,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:33.902Z",
"passed": true,
"duration": 3791,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:46.225Z",
"passed": false,
"duration": 12322,
"reason": "Expected 8, but got 5",
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:47.202Z",
"passed": false,
"duration": 974,
"reason": "Expected 8, but got 5",
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:48.005Z",
"passed": true,
"duration": 800,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:48.763Z",
"passed": true,
"duration": 756,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:55.510Z",
"passed": true,
"duration": 6745,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:56.297Z",
"passed": true,
"duration": 785,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:57.051Z",
"passed": true,
"duration": 751,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:58.294Z",
"passed": true,
"duration": 1241,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:23:04.551Z",
"passed": false,
"duration": 6255,
"reason": "Model returned empty response",
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:23:05.297Z",
"passed": true,
"duration": 743,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:23:06.018Z",
"passed": true,
"duration": 719,
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-3,-2",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:24:32.237Z",
"passed": false,
"duration": 1533,
"reason": "Expected -3,-2, but got -2,-3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-3, -2"
],
"expected": "-3,-2",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:24:50.178Z",
"passed": false,
"duration": 17934,
"reason": "Expected -3,-2, but got -3, -2",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-3,-2",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:24:51.040Z",
"passed": false,
"duration": 859,
"reason": "Expected -3,-2, but got -2, -3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-3,-2",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:24:51.938Z",
"passed": false,
"duration": 895,
"reason": "Expected -3,-2, but got -2, -3",
"category": "math"
}
],
"highscores": [
{
"test": "quadratic",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 859,
"duration_secs": 0.859
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 895,
"duration_secs": 0.895
}
]
},
{
"test": "factorial",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 782,
"duration_secs": 0.782
},
{
"model": "openai/gpt-4o-mini",
"duration": 872,
"duration_secs": 0.872
}
]
},
{
"test": "fibonacci",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 800,
"duration_secs": 0.8
},
{
"model": "openai/gpt-4o-mini",
"duration": 974,
"duration_secs": 0.974
}
]
},
{
"test": "square_root",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 751,
"duration_secs": 0.751
},
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 756,
"duration_secs": 0.756
}
]
},
{
"test": "power",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 719,
"duration_secs": 0.719
},
{
"model": "openai/gpt-4o-mini",
"duration": 743,
"duration_secs": 0.743
}
]
}
],
"lastUpdated": "2025-04-02T11:24:51.939Z"
}

View File

@ -0,0 +1,206 @@
# Math Operations Test Results
## Highscores
| Test | Model | Duration (ms) | Duration (s) |
|------|-------|--------------|--------------|
| power | openai/gpt-3.5-turbo | 719 | 0.72 |
| power | openai/gpt-4o-mini | 743 | 0.74 |
| square_root | openai/gpt-3.5-turbo | 751 | 0.75 |
| square_root | anthropic/claude-3.5-sonnet | 756 | 0.76 |
| factorial | openai/gpt-3.5-turbo | 782 | 0.78 |
| square_root | openai/gpt-4o-mini | 785 | 0.79 |
| fibonacci | openai/gpt-3.5-turbo | 800 | 0.80 |
| factorial | openai/gpt-4o-mini | 872 | 0.87 |
| quadratic | openai/gpt-3.5-turbo | 878 | 0.88 |
| factorial | anthropic/claude-3.5-sonnet | 970 | 0.97 |
| fibonacci | openai/gpt-4o-mini | 974 | 0.97 |
| quadratic | openai/gpt-4o-mini | 994 | 0.99 |
| power | anthropic/claude-3.5-sonnet | 1241 | 1.24 |
| quadratic | anthropic/claude-3.5-sonnet | 1650 | 1.65 |
| fibonacci | anthropic/claude-3.5-sonnet | 3791 | 3.79 |
| factorial | qwen/qwq-32b | 4954 | 4.95 |
| power | qwen/qwq-32b | 6255 | 6.25 |
| square_root | qwen/qwq-32b | 6745 | 6.75 |
| quadratic | qwen/qwq-32b | 10222 | 10.22 |
| fibonacci | qwen/qwq-32b | 12322 | 12.32 |
## Summary
- Total Tests: 20
- Passed: 14
- Failed: 6
- Success Rate: 70.00%
- Average Duration: 2860ms (2.86s)
## Failed Tests
### quadratic - anthropic/claude-3.5-sonnet
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-3,-2`
- Actual: `-2,-3`
- Duration: 1650ms (1.65s)
- Reason: Expected -3,-2, but got -2,-3
- Timestamp: 4/2/2025, 1:22:10 PM
### quadratic - openai/gpt-4o-mini
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-3,-2`
- Actual: `-2, -3`
- Duration: 994ms (0.99s)
- Reason: Expected -3,-2, but got -2, -3
- Timestamp: 4/2/2025, 1:22:21 PM
### quadratic - openai/gpt-3.5-turbo
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-3,-2`
- Actual: `-2, -3`
- Duration: 878ms (0.88s)
- Reason: Expected -3,-2, but got -2, -3
- Timestamp: 4/2/2025, 1:22:22 PM
### fibonacci - qwen/qwq-32b
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `5`
- Duration: 12322ms (12.32s)
- Reason: Expected 8, but got 5
- Timestamp: 4/2/2025, 1:22:46 PM
### fibonacci - openai/gpt-4o-mini
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `5`
- Duration: 974ms (0.97s)
- Reason: Expected 8, but got 5
- Timestamp: 4/2/2025, 1:22:47 PM
### power - qwen/qwq-32b
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: ``
- Duration: 6255ms (6.25s)
- Reason: Model returned empty response
- Timestamp: 4/2/2025, 1:23:04 PM
## Passed Tests
### quadratic - qwen/qwq-32b
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-3,-2`
- Actual: `-3,-2`
- Duration: 10222ms (10.22s)
- Timestamp: 4/2/2025, 1:22:20 PM
### factorial - anthropic/claude-3.5-sonnet
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 970ms (0.97s)
- Timestamp: 4/2/2025, 1:22:23 PM
### factorial - qwen/qwq-32b
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 4954ms (4.95s)
- Timestamp: 4/2/2025, 1:22:28 PM
### factorial - openai/gpt-4o-mini
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 872ms (0.87s)
- Timestamp: 4/2/2025, 1:22:29 PM
### factorial - openai/gpt-3.5-turbo
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 782ms (0.78s)
- Timestamp: 4/2/2025, 1:22:30 PM
### fibonacci - anthropic/claude-3.5-sonnet
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 3791ms (3.79s)
- Timestamp: 4/2/2025, 1:22:33 PM
### fibonacci - openai/gpt-3.5-turbo
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 800ms (0.80s)
- Timestamp: 4/2/2025, 1:22:48 PM
### square_root - anthropic/claude-3.5-sonnet
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 756ms (0.76s)
- Timestamp: 4/2/2025, 1:22:48 PM
### square_root - qwen/qwq-32b
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 6745ms (6.75s)
- Timestamp: 4/2/2025, 1:22:55 PM
### square_root - openai/gpt-4o-mini
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 785ms (0.79s)
- Timestamp: 4/2/2025, 1:22:56 PM
### square_root - openai/gpt-3.5-turbo
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 751ms (0.75s)
- Timestamp: 4/2/2025, 1:22:57 PM
### power - anthropic/claude-3.5-sonnet
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1241ms (1.24s)
- Timestamp: 4/2/2025, 1:22:58 PM
### power - openai/gpt-4o-mini
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 743ms (0.74s)
- Timestamp: 4/2/2025, 1:23:05 PM
### power - openai/gpt-3.5-turbo
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 719ms (0.72s)
- Timestamp: 4/2/2025, 1:23:06 PM