mono/packages/kbot/tests/unit/reports/basic.json

415 lines
11 KiB
JSON

{
"results": [
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:08.012Z",
"passed": true,
"duration": 771,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:08.530Z",
"passed": true,
"duration": 514,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:09.157Z",
"passed": true,
"duration": 624,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:09.880Z",
"passed": true,
"duration": 721,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:10.395Z",
"passed": true,
"duration": 513,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:11.292Z",
"passed": true,
"duration": 895,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [],
"expected": "yes",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:11.514Z",
"passed": false,
"duration": 220,
"reason": "Model returned empty response",
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"Yes"
],
"expected": "yes",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:15.879Z",
"passed": true,
"duration": 4358,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:56:39.121Z",
"passed": true,
"duration": 1838,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:56:39.863Z",
"passed": true,
"duration": 738,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T18:56:43.097Z",
"passed": true,
"duration": 3231,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:56:44.836Z",
"passed": true,
"duration": 1737,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:56:45.465Z",
"passed": true,
"duration": 626,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:15.716Z",
"passed": true,
"duration": 2024,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:16.361Z",
"passed": true,
"duration": 641,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:20.162Z",
"passed": true,
"duration": 3798,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:21.917Z",
"passed": true,
"duration": 1752,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:22.504Z",
"passed": true,
"duration": 585,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:25.779Z",
"passed": true,
"duration": 3272,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:27.557Z",
"passed": true,
"duration": 1775,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:28.041Z",
"passed": true,
"duration": 481,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:31.450Z",
"passed": true,
"duration": 3406,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"yes"
],
"expected": "yes",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:37.473Z",
"passed": true,
"duration": 6020,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"yes"
],
"expected": "yes",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:42.394Z",
"passed": true,
"duration": 4917,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [],
"expected": "yes",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:47.544Z",
"passed": false,
"duration": 5147,
"reason": "Model returned empty response",
"category": "basic"
}
],
"highscores": [
{
"test": "addition",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 641,
"duration_secs": 0.641
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 771,
"duration_secs": 0.771
}
]
},
{
"test": "multiplication",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 585,
"duration_secs": 0.585
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 624,
"duration_secs": 0.624
}
]
},
{
"test": "division",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 481,
"duration_secs": 0.481
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 513,
"duration_secs": 0.513
}
]
},
{
"test": "web_content",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 220,
"duration_secs": 0.22
},
{
"model": "openai/gpt-4o-mini",
"duration": 4917,
"duration_secs": 4.917
}
]
}
],
"lastUpdated": "2025-06-05T21:19:47.545Z"
}