mono/packages/kbot/tests/unit/reports/basic.json
2025-06-28 10:37:04 +02:00

584 lines
16 KiB
JSON

{
"results": [
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:08.012Z",
"passed": true,
"duration": 771,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:08.530Z",
"passed": true,
"duration": 514,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:09.157Z",
"passed": true,
"duration": 624,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:09.880Z",
"passed": true,
"duration": 721,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:10.395Z",
"passed": true,
"duration": 513,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:11.292Z",
"passed": true,
"duration": 895,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [],
"expected": "yes",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:11.514Z",
"passed": false,
"duration": 220,
"reason": "Model returned empty response",
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"Yes"
],
"expected": "yes",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:15.879Z",
"passed": true,
"duration": 4358,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:56:39.121Z",
"passed": true,
"duration": 1838,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:56:39.863Z",
"passed": true,
"duration": 738,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T18:56:43.097Z",
"passed": true,
"duration": 3231,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:56:44.836Z",
"passed": true,
"duration": 1737,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:56:45.465Z",
"passed": true,
"duration": 626,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:15.716Z",
"passed": true,
"duration": 2024,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:16.361Z",
"passed": true,
"duration": 641,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:20.162Z",
"passed": true,
"duration": 3798,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:21.917Z",
"passed": true,
"duration": 1752,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:22.504Z",
"passed": true,
"duration": 585,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:25.779Z",
"passed": true,
"duration": 3272,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:27.557Z",
"passed": true,
"duration": 1775,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:28.041Z",
"passed": true,
"duration": 481,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:31.450Z",
"passed": true,
"duration": 3406,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"yes"
],
"expected": "yes",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T21:19:37.473Z",
"passed": true,
"duration": 6020,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"yes"
],
"expected": "yes",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T21:19:42.394Z",
"passed": true,
"duration": 4917,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [],
"expected": "yes",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T21:19:47.544Z",
"passed": false,
"duration": 5147,
"reason": "Model returned empty response",
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:54:48.966Z",
"passed": true,
"duration": 1522,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:54:49.606Z",
"passed": true,
"duration": 634,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:54:53.004Z",
"passed": true,
"duration": 3394,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:54:53.710Z",
"passed": true,
"duration": 702,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:54:56.480Z",
"passed": true,
"duration": 2765,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"The result of multiplying 8 and 3 is \\boxed{24}."
],
"expected": "24",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:54:59.909Z",
"passed": false,
"duration": 3425,
"reason": "Expected 24, but got The result of multiplying 8 and 3 is \\boxed{24}.",
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:55:01.169Z",
"passed": true,
"duration": 1252,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:55:01.737Z",
"passed": true,
"duration": 564,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:55:06.362Z",
"passed": true,
"duration": 4619,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"Looking through the table of contents in the Wikipedia article on Kenya, I can see that there is indeed a section titled \"Prehistory\" under the History section.\n\nyes"
],
"expected": "yes",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:55:12.528Z",
"passed": false,
"duration": 6161,
"reason": "Expected yes, but got Looking through the table of contents in the Wikipedia article on Kenya, I can see that there is indeed a section titled \"Prehistory\" under the History section.\n\nyes",
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [
"yes"
],
"expected": "yes",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:55:18.757Z",
"passed": true,
"duration": 6225,
"category": "basic"
},
{
"test": "web_content",
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
"result": [],
"expected": "yes",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:55:25.642Z",
"passed": false,
"duration": 6879,
"reason": "Model returned empty response",
"category": "basic"
}
],
"highscores": [
{
"test": "addition",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 634,
"duration_secs": 0.634
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 771,
"duration_secs": 0.771
}
]
},
{
"test": "multiplication",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 624,
"duration_secs": 0.624
},
{
"model": "anthropic/claude-sonnet-4",
"duration": 702,
"duration_secs": 0.702
}
]
},
{
"test": "division",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 513,
"duration_secs": 0.513
},
{
"model": "openai/gpt-4o-mini",
"duration": 564,
"duration_secs": 0.564
}
]
},
{
"test": "web_content",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 220,
"duration_secs": 0.22
},
{
"model": "anthropic/claude-sonnet-4",
"duration": 6161,
"duration_secs": 6.161
}
]
}
],
"lastUpdated": "2025-06-05T22:55:25.642Z"
}