415 lines
11 KiB
JSON
415 lines
11 KiB
JSON
{
|
|
"results": [
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:08.012Z",
|
|
"passed": true,
|
|
"duration": 771,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:08.530Z",
|
|
"passed": true,
|
|
"duration": 514,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:09.157Z",
|
|
"passed": true,
|
|
"duration": 624,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:09.880Z",
|
|
"passed": true,
|
|
"duration": 721,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:10.395Z",
|
|
"passed": true,
|
|
"duration": 513,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:11.292Z",
|
|
"passed": true,
|
|
"duration": 895,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [],
|
|
"expected": "yes",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:11.514Z",
|
|
"passed": false,
|
|
"duration": 220,
|
|
"reason": "Model returned empty response",
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [
|
|
"Yes"
|
|
],
|
|
"expected": "yes",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:15.879Z",
|
|
"passed": true,
|
|
"duration": 4358,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T18:56:39.121Z",
|
|
"passed": true,
|
|
"duration": 1838,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:56:39.863Z",
|
|
"passed": true,
|
|
"duration": 738,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T18:56:43.097Z",
|
|
"passed": true,
|
|
"duration": 3231,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T18:56:44.836Z",
|
|
"passed": true,
|
|
"duration": 1737,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:56:45.465Z",
|
|
"passed": true,
|
|
"duration": 626,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T21:19:15.716Z",
|
|
"passed": true,
|
|
"duration": 2024,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T21:19:16.361Z",
|
|
"passed": true,
|
|
"duration": 641,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T21:19:20.162Z",
|
|
"passed": true,
|
|
"duration": 3798,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T21:19:21.917Z",
|
|
"passed": true,
|
|
"duration": 1752,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T21:19:22.504Z",
|
|
"passed": true,
|
|
"duration": 585,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T21:19:25.779Z",
|
|
"passed": true,
|
|
"duration": 3272,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T21:19:27.557Z",
|
|
"passed": true,
|
|
"duration": 1775,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T21:19:28.041Z",
|
|
"passed": true,
|
|
"duration": 481,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T21:19:31.450Z",
|
|
"passed": true,
|
|
"duration": 3406,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [
|
|
"yes"
|
|
],
|
|
"expected": "yes",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T21:19:37.473Z",
|
|
"passed": true,
|
|
"duration": 6020,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [
|
|
"yes"
|
|
],
|
|
"expected": "yes",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T21:19:42.394Z",
|
|
"passed": true,
|
|
"duration": 4917,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [],
|
|
"expected": "yes",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T21:19:47.544Z",
|
|
"passed": false,
|
|
"duration": 5147,
|
|
"reason": "Model returned empty response",
|
|
"category": "basic"
|
|
}
|
|
],
|
|
"highscores": [
|
|
{
|
|
"test": "addition",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 641,
|
|
"duration_secs": 0.641
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 771,
|
|
"duration_secs": 0.771
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 585,
|
|
"duration_secs": 0.585
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 624,
|
|
"duration_secs": 0.624
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "division",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 481,
|
|
"duration_secs": 0.481
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 513,
|
|
"duration_secs": 0.513
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 220,
|
|
"duration_secs": 0.22
|
|
},
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 4917,
|
|
"duration_secs": 4.917
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"lastUpdated": "2025-06-05T21:19:47.545Z"
|
|
} |