mono/packages/kbot/tests/unit/reports/basic.json

248 lines
6.2 KiB
JSON

{
"results": [
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:08.012Z",
"passed": true,
"duration": 771,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:08.530Z",
"passed": true,
"duration": 514,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:09.157Z",
"passed": true,
"duration": 624,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:09.880Z",
"passed": true,
"duration": 721,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:10.395Z",
"passed": true,
"duration": 513,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:11.292Z",
"passed": true,
"duration": 895,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [],
"expected": "yes",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:11.514Z",
"passed": false,
"duration": 220,
"reason": "Model returned empty response",
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"Yes"
],
"expected": "yes",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:15.879Z",
"passed": true,
"duration": 4358,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:56:39.121Z",
"passed": true,
"duration": 1838,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:56:39.863Z",
"passed": true,
"duration": 738,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T18:56:43.097Z",
"passed": true,
"duration": 3231,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:56:44.836Z",
"passed": true,
"duration": 1737,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:56:45.465Z",
"passed": true,
"duration": 626,
"category": "basic"
}
],
"highscores": [
{
"test": "addition",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 738,
"duration_secs": 0.738
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 771,
"duration_secs": 0.771
}
]
},
{
"test": "multiplication",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 624,
"duration_secs": 0.624
},
{
"model": "openai/gpt-4o-mini",
"duration": 626,
"duration_secs": 0.626
}
]
},
{
"test": "division",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 513,
"duration_secs": 0.513
},
{
"model": "openai/gpt-4o-mini",
"duration": 895,
"duration_secs": 0.895
}
]
},
{
"test": "web_content",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 220,
"duration_secs": 0.22
},
{
"model": "openai/gpt-4o-mini",
"duration": 4358,
"duration_secs": 4.358
}
]
}
],
"lastUpdated": "2025-06-05T18:56:45.466Z"
}