test report commons
This commit is contained in:
parent
1c3a2d981e
commit
bc644f8635
@ -1,5 +1,5 @@
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
|
||||
@ -9,7 +9,11 @@ import { sync as mkdirp } from "mkdirp"
|
||||
export const getDefaultModels = () => [
|
||||
//E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
|
||||
E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
|
||||
E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B
|
||||
E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B,
|
||||
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
|
||||
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
|
||||
E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1,
|
||||
E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE
|
||||
]
|
||||
|
||||
export const isOpenRouterModel = (model: string): boolean => {
|
||||
@ -51,6 +55,7 @@ export interface TestResult {
|
||||
details?: any;
|
||||
};
|
||||
duration?: number
|
||||
category?: string;
|
||||
}
|
||||
|
||||
export interface TestHighscore {
|
||||
@ -108,7 +113,7 @@ export const runTest = async (
|
||||
logPath: string
|
||||
): Promise<TestResult> => {
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
let router = 'openrouter'
|
||||
let startTime = Date.now()
|
||||
let error: TestResult['error'] | undefined
|
||||
let testResult: TestResult | undefined
|
||||
@ -124,7 +129,7 @@ export const runTest = async (
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = isOpenRouterModel(model) ? 'openrouter' : 'unknown'
|
||||
router = options.model as string
|
||||
return options
|
||||
}
|
||||
}),
|
||||
@ -181,6 +186,13 @@ export const runTest = async (
|
||||
throw e
|
||||
} finally {
|
||||
if (testResult) {
|
||||
// Extract category from logPath (e.g., 'reports/basic.json' -> 'basic')
|
||||
const category = path.basename(logPath, path.extname(logPath))
|
||||
|
||||
// Add category to test result
|
||||
testResult.category = category
|
||||
|
||||
// Update category-specific log
|
||||
const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath) as string) : { results: [], highscores: [] }
|
||||
const updatedResults = [...(existingData.results || []), testResult]
|
||||
|
||||
@ -200,12 +212,40 @@ export const runTest = async (
|
||||
// Generate highscores
|
||||
const highscores = generateHighscores(latestResults)
|
||||
|
||||
// Write both results and highscores
|
||||
// Write category-specific results
|
||||
write(logPath, JSON.stringify({
|
||||
results: updatedResults,
|
||||
highscores,
|
||||
lastUpdated: new Date().toISOString()
|
||||
}, null, 2))
|
||||
|
||||
// Update central all.json log
|
||||
const allLogPath = path.resolve(REPORTS_DIR, 'all.json')
|
||||
const allExistingData = exists(allLogPath) === 'file' ? JSON.parse(read(allLogPath) as string) : { results: [], highscores: [] }
|
||||
const allUpdatedResults = [...(allExistingData.results || []), testResult]
|
||||
|
||||
// Group all results by test and model
|
||||
const allLatestResults = new Map<string, Map<string, TestResult>>()
|
||||
allUpdatedResults.forEach(result => {
|
||||
if (!allLatestResults.has(result.test)) {
|
||||
allLatestResults.set(result.test, new Map())
|
||||
}
|
||||
const testMap = allLatestResults.get(result.test)!
|
||||
const existingResult = testMap.get(result.model)
|
||||
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
|
||||
testMap.set(result.model, result)
|
||||
}
|
||||
})
|
||||
|
||||
// Generate highscores for all results
|
||||
const allHighscores = generateHighscores(allLatestResults)
|
||||
|
||||
// Write all results
|
||||
write(allLogPath, JSON.stringify({
|
||||
results: allUpdatedResults,
|
||||
highscores: allHighscores,
|
||||
lastUpdated: new Date().toISOString()
|
||||
}, null, 2))
|
||||
}
|
||||
}
|
||||
return testResult
|
||||
@ -236,25 +276,29 @@ export const generateTestReport = (
|
||||
|
||||
// Add highscore section
|
||||
report += '## Highscores\n\n'
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
report += `### ${testName}\n`
|
||||
|
||||
// Convert model results to array and sort by duration
|
||||
const sortedResults = Array.from(modelResults.entries())
|
||||
.map(([model, result]) => ({ model, result }))
|
||||
.sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0))
|
||||
.slice(0, 2) // Get top 2
|
||||
|
||||
// Create a table header
|
||||
report += '| Test | Model | Duration (ms) | Duration (s) |\n'
|
||||
report += '|------|-------|--------------|--------------|\n'
|
||||
|
||||
// Sort all results by duration
|
||||
const allResults = Array.from(latestResults.entries())
|
||||
.flatMap(([testName, modelResults]) =>
|
||||
Array.from(modelResults.entries())
|
||||
.map(([model, result]) => ({
|
||||
test: testName,
|
||||
model,
|
||||
duration: result.duration || 0
|
||||
}))
|
||||
)
|
||||
.sort((a, b) => a.duration - b.duration)
|
||||
|
||||
if (sortedResults.length > 0) {
|
||||
sortedResults.forEach(({ model, result }, index) => {
|
||||
const duration = result.duration || 0
|
||||
report += `${index + 1}. ${model}: ${duration}ms (${(duration / 1000).toFixed(2)}s)\n`
|
||||
})
|
||||
} else {
|
||||
report += '*No results available*\n'
|
||||
}
|
||||
report += '\n'
|
||||
}
|
||||
// Add all results to the table
|
||||
allResults.forEach(({ test, model, duration }) => {
|
||||
report += `| ${test} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
|
||||
})
|
||||
|
||||
report += '\n'
|
||||
|
||||
// Add summary section
|
||||
report += '## Summary\n\n'
|
||||
@ -275,7 +319,7 @@ export const generateTestReport = (
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
hasFailures = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `### ${testName} - ${model}\n\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
@ -305,7 +349,7 @@ export const generateTestReport = (
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
hasPassed = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `### ${testName} - ${model}\n\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
|
||||
1202
packages/kbot/tests/unit/reports/all.json
Normal file
1202
packages/kbot/tests/unit/reports/all.json
Normal file
File diff suppressed because it is too large
Load Diff
@ -155,6 +155,594 @@
|
||||
"timestamp": "2025-04-02T11:01:27.791Z",
|
||||
"passed": true,
|
||||
"duration": 7852
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:21.370Z",
|
||||
"passed": true,
|
||||
"duration": 1213,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:24.898Z",
|
||||
"passed": true,
|
||||
"duration": 3524,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:25.624Z",
|
||||
"passed": true,
|
||||
"duration": 724,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:26.630Z",
|
||||
"passed": true,
|
||||
"duration": 1005,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:27.812Z",
|
||||
"passed": true,
|
||||
"duration": 1178,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:31.317Z",
|
||||
"passed": true,
|
||||
"duration": 3503,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:32.288Z",
|
||||
"passed": true,
|
||||
"duration": 969,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:33.147Z",
|
||||
"passed": true,
|
||||
"duration": 858,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:33.724Z",
|
||||
"passed": true,
|
||||
"duration": 576,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:34.841Z",
|
||||
"passed": true,
|
||||
"duration": 1115,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:35.673Z",
|
||||
"passed": true,
|
||||
"duration": 831,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:16:36.762Z",
|
||||
"passed": true,
|
||||
"duration": 1087,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:25.749Z",
|
||||
"passed": true,
|
||||
"duration": 1644,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:31.261Z",
|
||||
"passed": true,
|
||||
"duration": 5507,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:32.131Z",
|
||||
"passed": true,
|
||||
"duration": 869,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:33.306Z",
|
||||
"passed": true,
|
||||
"duration": 1173,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:34.323Z",
|
||||
"passed": true,
|
||||
"duration": 1016,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:38.976Z",
|
||||
"passed": true,
|
||||
"duration": 4651,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:39.914Z",
|
||||
"passed": true,
|
||||
"duration": 937,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:41.053Z",
|
||||
"passed": true,
|
||||
"duration": 1137,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:42.918Z",
|
||||
"passed": true,
|
||||
"duration": 1863,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:47.234Z",
|
||||
"passed": true,
|
||||
"duration": 4314,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:47.966Z",
|
||||
"passed": true,
|
||||
"duration": 730,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:20:48.941Z",
|
||||
"passed": true,
|
||||
"duration": 973,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:25:15.745Z",
|
||||
"passed": true,
|
||||
"duration": 1951,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:25:19.476Z",
|
||||
"passed": true,
|
||||
"duration": 3726,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:25:20.854Z",
|
||||
"passed": true,
|
||||
"duration": 1376,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:25:28.044Z",
|
||||
"passed": true,
|
||||
"duration": 7188,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:25:43.203Z",
|
||||
"passed": true,
|
||||
"duration": 15157,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:25:50.736Z",
|
||||
"passed": true,
|
||||
"duration": 7531,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:25:51.834Z",
|
||||
"passed": true,
|
||||
"duration": 1096,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:25:55.428Z",
|
||||
"passed": true,
|
||||
"duration": 3592,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:25:56.874Z",
|
||||
"passed": true,
|
||||
"duration": 1444,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:25:57.746Z",
|
||||
"passed": true,
|
||||
"duration": 870,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-r1",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:26:08.731Z",
|
||||
"passed": true,
|
||||
"duration": 10983,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:26:14.379Z",
|
||||
"passed": true,
|
||||
"duration": 5646,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:26:15.658Z",
|
||||
"passed": true,
|
||||
"duration": 1276,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:26:21.428Z",
|
||||
"passed": true,
|
||||
"duration": 5768,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:26:22.358Z",
|
||||
"passed": true,
|
||||
"duration": 929,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:26:23.155Z",
|
||||
"passed": true,
|
||||
"duration": 794,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-r1",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:26:38.566Z",
|
||||
"passed": true,
|
||||
"duration": 15409,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:26:40.358Z",
|
||||
"passed": true,
|
||||
"duration": 1790,
|
||||
"category": "basic"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
@ -162,14 +750,14 @@
|
||||
"test": "addition",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1888,
|
||||
"duration_secs": 1.888
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1376,
|
||||
"duration_secs": 1.376
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 6304,
|
||||
"duration_secs": 6.304
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1951,
|
||||
"duration_secs": 1.951
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -177,14 +765,14 @@
|
||||
"test": "multiplication",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1291,
|
||||
"duration_secs": 1.291
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 870,
|
||||
"duration_secs": 0.87
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 2225,
|
||||
"duration_secs": 2.225
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1096,
|
||||
"duration_secs": 1.096
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -192,17 +780,17 @@
|
||||
"test": "division",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1209,
|
||||
"duration_secs": 1.209
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 794,
|
||||
"duration_secs": 0.794
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 7852,
|
||||
"duration_secs": 7.852
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 929,
|
||||
"duration_secs": 0.929
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-02T11:01:27.792Z"
|
||||
"lastUpdated": "2025-04-02T11:26:40.358Z"
|
||||
}
|
||||
@ -1,72 +0,0 @@
|
||||
# Basic Operations Test Results
|
||||
|
||||
## Highscores
|
||||
|
||||
### addition
|
||||
1. anthropic/claude-3.5-sonnet: 1237ms (1.24s)
|
||||
2. qwen/qwq-32b: 4298ms (4.30s)
|
||||
|
||||
### multiplication
|
||||
1. anthropic/claude-3.5-sonnet: 1411ms (1.41s)
|
||||
2. qwen/qwq-32b: 3122ms (3.12s)
|
||||
|
||||
### division
|
||||
1. anthropic/claude-3.5-sonnet: 583ms (0.58s)
|
||||
2. qwen/qwq-32b: 3615ms (3.62s)
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 6
|
||||
- Passed: 6
|
||||
- Failed: 0
|
||||
- Success Rate: 100.00%
|
||||
- Average Duration: 2378ms (2.38s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
*No failed tests*
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### addition - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1237ms (1237.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:09 PM
|
||||
|
||||
### addition - qwen/qwq-32b
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 4298ms (4298.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:13 PM
|
||||
|
||||
### multiplication - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 1411ms (1411.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:15 PM
|
||||
|
||||
### multiplication - qwen/qwq-32b
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 3122ms (3122.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:18 PM
|
||||
|
||||
### division - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 583ms (583.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:18 PM
|
||||
|
||||
### division - qwen/qwq-32b
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 3615ms (3615.00s)
|
||||
- Timestamp: 4/2/2025, 12:56:22 PM
|
||||
|
||||
@ -2,25 +2,34 @@
|
||||
|
||||
## Highscores
|
||||
|
||||
### addition
|
||||
1. anthropic/claude-3.5-sonnet: 1888ms (1.89s)
|
||||
2. qwen/qwq-32b: 6304ms (6.30s)
|
||||
|
||||
### multiplication
|
||||
1. anthropic/claude-3.5-sonnet: 1291ms (1.29s)
|
||||
2. qwen/qwq-32b: 2225ms (2.23s)
|
||||
|
||||
### division
|
||||
1. anthropic/claude-3.5-sonnet: 1209ms (1.21s)
|
||||
2. qwen/qwq-32b: 7852ms (7.85s)
|
||||
| Test | Model | Duration (ms) | Duration (s) |
|
||||
|------|-------|--------------|--------------|
|
||||
| division | openai/gpt-3.5-turbo | 794 | 0.79 |
|
||||
| multiplication | openai/gpt-3.5-turbo | 870 | 0.87 |
|
||||
| division | openai/gpt-4o-mini | 929 | 0.93 |
|
||||
| multiplication | anthropic/claude-3.5-sonnet | 1096 | 1.10 |
|
||||
| division | anthropic/claude-3.5-sonnet | 1276 | 1.28 |
|
||||
| addition | openai/gpt-4o-mini | 1376 | 1.38 |
|
||||
| multiplication | openai/gpt-4o-mini | 1444 | 1.44 |
|
||||
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 1790 | 1.79 |
|
||||
| addition | anthropic/claude-3.5-sonnet | 1951 | 1.95 |
|
||||
| multiplication | qwen/qwq-32b | 3592 | 3.59 |
|
||||
| addition | qwen/qwq-32b | 3726 | 3.73 |
|
||||
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 5646 | 5.65 |
|
||||
| division | qwen/qwq-32b | 5768 | 5.77 |
|
||||
| addition | openai/gpt-3.5-turbo | 7188 | 7.19 |
|
||||
| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 7531 | 7.53 |
|
||||
| multiplication | deepseek/deepseek-r1 | 10983 | 10.98 |
|
||||
| addition | deepseek/deepseek-r1 | 15157 | 15.16 |
|
||||
| division | deepseek/deepseek-r1 | 15409 | 15.41 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 6
|
||||
- Passed: 6
|
||||
- Total Tests: 18
|
||||
- Passed: 18
|
||||
- Failed: 0
|
||||
- Success Rate: 100.00%
|
||||
- Average Duration: 3462ms (3.46s)
|
||||
- Average Duration: 4807ms (4.81s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
@ -29,44 +38,146 @@
|
||||
## Passed Tests
|
||||
|
||||
### addition - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1888ms (1888.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:08 PM
|
||||
- Duration: 1951ms (1951.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:15 PM
|
||||
|
||||
### addition - qwen/qwq-32b
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 6304ms (6304.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:15 PM
|
||||
- Duration: 3726ms (3726.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:19 PM
|
||||
|
||||
### addition - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1376ms (1376.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:20 PM
|
||||
|
||||
### addition - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 7188ms (7188.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:28 PM
|
||||
|
||||
### addition - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 15157ms (15157.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:43 PM
|
||||
|
||||
### addition - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 7531ms (7531.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:50 PM
|
||||
|
||||
### multiplication - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 1291ms (1291.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:16 PM
|
||||
- Duration: 1096ms (1096.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:51 PM
|
||||
|
||||
### multiplication - qwen/qwq-32b
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 2225ms (2225.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:18 PM
|
||||
- Duration: 3592ms (3592.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:55 PM
|
||||
|
||||
### multiplication - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 1444ms (1444.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:56 PM
|
||||
|
||||
### multiplication - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 870ms (870.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:57 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 10983ms (10983.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:08 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 5646ms (5646.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:14 PM
|
||||
|
||||
### division - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 1209ms (1209.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:19 PM
|
||||
- Duration: 1276ms (1276.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:15 PM
|
||||
|
||||
### division - qwen/qwq-32b
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 7852ms (7852.00s)
|
||||
- Timestamp: 4/2/2025, 1:01:27 PM
|
||||
- Duration: 5768ms (5768.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:21 PM
|
||||
|
||||
### division - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 929ms (929.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:22 PM
|
||||
|
||||
### division - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 794ms (794.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:23 PM
|
||||
|
||||
### division - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 15409ms (15409.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:38 PM
|
||||
|
||||
### division - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 1790ms (1790.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:40 PM
|
||||
|
||||
|
||||
700
packages/kbot/tests/unit/reports/math.json
Normal file
700
packages/kbot/tests/unit/reports/math.json
Normal file
@ -0,0 +1,700 @@
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2,-3"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:09:36.865Z",
|
||||
"passed": false,
|
||||
"duration": 1944,
|
||||
"reason": "Expected -3,-2, but got -2,-3"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-3,-2"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:09:47.481Z",
|
||||
"passed": true,
|
||||
"duration": 10608
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:09:49.153Z",
|
||||
"passed": true,
|
||||
"duration": 1671
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [],
|
||||
"expected": "120",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:10:03.043Z",
|
||||
"passed": false,
|
||||
"duration": 13889,
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:10:03.988Z",
|
||||
"passed": true,
|
||||
"duration": 943
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:10:05.723Z",
|
||||
"passed": false,
|
||||
"duration": 1734,
|
||||
"reason": "Expected 8, but got 5"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:10:07.465Z",
|
||||
"passed": true,
|
||||
"duration": 1739
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:10:13.671Z",
|
||||
"passed": true,
|
||||
"duration": 6205
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:10:14.967Z",
|
||||
"passed": true,
|
||||
"duration": 1295
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:10:20.932Z",
|
||||
"passed": true,
|
||||
"duration": 5964
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2,-3"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:13:10.276Z",
|
||||
"passed": false,
|
||||
"duration": 1242,
|
||||
"reason": "Expected -3,-2, but got -2,-3",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2, -3"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:13:31.650Z",
|
||||
"passed": false,
|
||||
"duration": 21368,
|
||||
"reason": "Expected -3,-2, but got -2, -3",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:13:34.699Z",
|
||||
"passed": true,
|
||||
"duration": 3046,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:13:45.957Z",
|
||||
"passed": true,
|
||||
"duration": 11256,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:13:47.935Z",
|
||||
"passed": true,
|
||||
"duration": 1976,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:14:07.714Z",
|
||||
"passed": false,
|
||||
"duration": 19778,
|
||||
"reason": "Expected 8, but got 5",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:14:08.883Z",
|
||||
"passed": true,
|
||||
"duration": 1167,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:14:12.225Z",
|
||||
"passed": true,
|
||||
"duration": 3341,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:14:12.889Z",
|
||||
"passed": true,
|
||||
"duration": 663,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:14:32.527Z",
|
||||
"passed": true,
|
||||
"duration": 19636,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2,-3"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:10.419Z",
|
||||
"passed": false,
|
||||
"duration": 1650,
|
||||
"reason": "Expected -3,-2, but got -2,-3",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-3,-2"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:20.647Z",
|
||||
"passed": true,
|
||||
"duration": 10222,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2, -3"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:21.643Z",
|
||||
"passed": false,
|
||||
"duration": 994,
|
||||
"reason": "Expected -3,-2, but got -2, -3",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2, -3"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:22.524Z",
|
||||
"passed": false,
|
||||
"duration": 878,
|
||||
"reason": "Expected -3,-2, but got -2, -3",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:23.496Z",
|
||||
"passed": true,
|
||||
"duration": 970,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:28.452Z",
|
||||
"passed": true,
|
||||
"duration": 4954,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:29.325Z",
|
||||
"passed": true,
|
||||
"duration": 872,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:30.109Z",
|
||||
"passed": true,
|
||||
"duration": 782,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:33.902Z",
|
||||
"passed": true,
|
||||
"duration": 3791,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:46.225Z",
|
||||
"passed": false,
|
||||
"duration": 12322,
|
||||
"reason": "Expected 8, but got 5",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:47.202Z",
|
||||
"passed": false,
|
||||
"duration": 974,
|
||||
"reason": "Expected 8, but got 5",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:48.005Z",
|
||||
"passed": true,
|
||||
"duration": 800,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:48.763Z",
|
||||
"passed": true,
|
||||
"duration": 756,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:55.510Z",
|
||||
"passed": true,
|
||||
"duration": 6745,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:56.297Z",
|
||||
"passed": true,
|
||||
"duration": 785,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"4"
|
||||
],
|
||||
"expected": "4",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:57.051Z",
|
||||
"passed": true,
|
||||
"duration": 751,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:22:58.294Z",
|
||||
"passed": true,
|
||||
"duration": 1241,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:23:04.551Z",
|
||||
"passed": false,
|
||||
"duration": 6255,
|
||||
"reason": "Model returned empty response",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:23:05.297Z",
|
||||
"passed": true,
|
||||
"duration": 743,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:23:06.018Z",
|
||||
"passed": true,
|
||||
"duration": 719,
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2,-3"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:24:32.237Z",
|
||||
"passed": false,
|
||||
"duration": 1533,
|
||||
"reason": "Expected -3,-2, but got -2,-3",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-3, -2"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:24:50.178Z",
|
||||
"passed": false,
|
||||
"duration": 17934,
|
||||
"reason": "Expected -3,-2, but got -3, -2",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2, -3"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:24:51.040Z",
|
||||
"passed": false,
|
||||
"duration": 859,
|
||||
"reason": "Expected -3,-2, but got -2, -3",
|
||||
"category": "math"
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
||||
"result": [
|
||||
"-2, -3"
|
||||
],
|
||||
"expected": "-3,-2",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T11:24:51.938Z",
|
||||
"passed": false,
|
||||
"duration": 895,
|
||||
"reason": "Expected -3,-2, but got -2, -3",
|
||||
"category": "math"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
{
|
||||
"test": "quadratic",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 859,
|
||||
"duration_secs": 0.859
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 895,
|
||||
"duration_secs": 0.895
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 782,
|
||||
"duration_secs": 0.782
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 872,
|
||||
"duration_secs": 0.872
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 800,
|
||||
"duration_secs": 0.8
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 974,
|
||||
"duration_secs": 0.974
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "square_root",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 751,
|
||||
"duration_secs": 0.751
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 756,
|
||||
"duration_secs": 0.756
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "power",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 719,
|
||||
"duration_secs": 0.719
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 743,
|
||||
"duration_secs": 0.743
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-02T11:24:51.939Z"
|
||||
}
|
||||
206
packages/kbot/tests/unit/reports/math.md
Normal file
206
packages/kbot/tests/unit/reports/math.md
Normal file
@ -0,0 +1,206 @@
|
||||
# Math Operations Test Results
|
||||
|
||||
|
||||
## Highscores
|
||||
|
||||
| Test | Model | Duration (ms) | Duration (s) |
|
||||
|------|-------|--------------|--------------|
|
||||
| power | openai/gpt-3.5-turbo | 719 | 0.72 |
|
||||
| power | openai/gpt-4o-mini | 743 | 0.74 |
|
||||
| square_root | openai/gpt-3.5-turbo | 751 | 0.75 |
|
||||
| square_root | anthropic/claude-3.5-sonnet | 756 | 0.76 |
|
||||
| factorial | openai/gpt-3.5-turbo | 782 | 0.78 |
|
||||
| square_root | openai/gpt-4o-mini | 785 | 0.79 |
|
||||
| fibonacci | openai/gpt-3.5-turbo | 800 | 0.80 |
|
||||
| factorial | openai/gpt-4o-mini | 872 | 0.87 |
|
||||
| quadratic | openai/gpt-3.5-turbo | 878 | 0.88 |
|
||||
| factorial | anthropic/claude-3.5-sonnet | 970 | 0.97 |
|
||||
| fibonacci | openai/gpt-4o-mini | 974 | 0.97 |
|
||||
| quadratic | openai/gpt-4o-mini | 994 | 0.99 |
|
||||
| power | anthropic/claude-3.5-sonnet | 1241 | 1.24 |
|
||||
| quadratic | anthropic/claude-3.5-sonnet | 1650 | 1.65 |
|
||||
| fibonacci | anthropic/claude-3.5-sonnet | 3791 | 3.79 |
|
||||
| factorial | qwen/qwq-32b | 4954 | 4.95 |
|
||||
| power | qwen/qwq-32b | 6255 | 6.25 |
|
||||
| square_root | qwen/qwq-32b | 6745 | 6.75 |
|
||||
| quadratic | qwen/qwq-32b | 10222 | 10.22 |
|
||||
| fibonacci | qwen/qwq-32b | 12322 | 12.32 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 20
|
||||
- Passed: 14
|
||||
- Failed: 6
|
||||
- Success Rate: 70.00%
|
||||
- Average Duration: 2860ms (2.86s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### quadratic - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-3,-2`
|
||||
- Actual: `-2,-3`
|
||||
- Duration: 1650ms (1.65s)
|
||||
- Reason: Expected -3,-2, but got -2,-3
|
||||
- Timestamp: 4/2/2025, 1:22:10 PM
|
||||
|
||||
### quadratic - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-3,-2`
|
||||
- Actual: `-2, -3`
|
||||
- Duration: 994ms (0.99s)
|
||||
- Reason: Expected -3,-2, but got -2, -3
|
||||
- Timestamp: 4/2/2025, 1:22:21 PM
|
||||
|
||||
### quadratic - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-3,-2`
|
||||
- Actual: `-2, -3`
|
||||
- Duration: 878ms (0.88s)
|
||||
- Reason: Expected -3,-2, but got -2, -3
|
||||
- Timestamp: 4/2/2025, 1:22:22 PM
|
||||
|
||||
### fibonacci - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `5`
|
||||
- Duration: 12322ms (12.32s)
|
||||
- Reason: Expected 8, but got 5
|
||||
- Timestamp: 4/2/2025, 1:22:46 PM
|
||||
|
||||
### fibonacci - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `5`
|
||||
- Duration: 974ms (0.97s)
|
||||
- Reason: Expected 8, but got 5
|
||||
- Timestamp: 4/2/2025, 1:22:47 PM
|
||||
|
||||
### power - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: ``
|
||||
- Duration: 6255ms (6.25s)
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/2/2025, 1:23:04 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### quadratic - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-3,-2`
|
||||
- Actual: `-3,-2`
|
||||
- Duration: 10222ms (10.22s)
|
||||
- Timestamp: 4/2/2025, 1:22:20 PM
|
||||
|
||||
### factorial - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 970ms (0.97s)
|
||||
- Timestamp: 4/2/2025, 1:22:23 PM
|
||||
|
||||
### factorial - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 4954ms (4.95s)
|
||||
- Timestamp: 4/2/2025, 1:22:28 PM
|
||||
|
||||
### factorial - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 872ms (0.87s)
|
||||
- Timestamp: 4/2/2025, 1:22:29 PM
|
||||
|
||||
### factorial - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 782ms (0.78s)
|
||||
- Timestamp: 4/2/2025, 1:22:30 PM
|
||||
|
||||
### fibonacci - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 3791ms (3.79s)
|
||||
- Timestamp: 4/2/2025, 1:22:33 PM
|
||||
|
||||
### fibonacci - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 800ms (0.80s)
|
||||
- Timestamp: 4/2/2025, 1:22:48 PM
|
||||
|
||||
### square_root - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 756ms (0.76s)
|
||||
- Timestamp: 4/2/2025, 1:22:48 PM
|
||||
|
||||
### square_root - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 6745ms (6.75s)
|
||||
- Timestamp: 4/2/2025, 1:22:55 PM
|
||||
|
||||
### square_root - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 785ms (0.79s)
|
||||
- Timestamp: 4/2/2025, 1:22:56 PM
|
||||
|
||||
### square_root - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 751ms (0.75s)
|
||||
- Timestamp: 4/2/2025, 1:22:57 PM
|
||||
|
||||
### power - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1241ms (1.24s)
|
||||
- Timestamp: 4/2/2025, 1:22:58 PM
|
||||
|
||||
### power - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 743ms (0.74s)
|
||||
- Timestamp: 4/2/2025, 1:23:05 PM
|
||||
|
||||
### power - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 719ms (0.72s)
|
||||
- Timestamp: 4/2/2025, 1:23:06 PM
|
||||
|
||||
Loading…
Reference in New Issue
Block a user