584 lines
16 KiB
JSON
584 lines
16 KiB
JSON
{
|
|
"results": [
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:08.012Z",
|
|
"passed": true,
|
|
"duration": 771,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:08.530Z",
|
|
"passed": true,
|
|
"duration": 514,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:09.157Z",
|
|
"passed": true,
|
|
"duration": 624,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:09.880Z",
|
|
"passed": true,
|
|
"duration": 721,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:10.395Z",
|
|
"passed": true,
|
|
"duration": 513,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:11.292Z",
|
|
"passed": true,
|
|
"duration": 895,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [],
|
|
"expected": "yes",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:11.514Z",
|
|
"passed": false,
|
|
"duration": 220,
|
|
"reason": "Model returned empty response",
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [
|
|
"Yes"
|
|
],
|
|
"expected": "yes",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:15.879Z",
|
|
"passed": true,
|
|
"duration": 4358,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T18:56:39.121Z",
|
|
"passed": true,
|
|
"duration": 1838,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:56:39.863Z",
|
|
"passed": true,
|
|
"duration": 738,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T18:56:43.097Z",
|
|
"passed": true,
|
|
"duration": 3231,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T18:56:44.836Z",
|
|
"passed": true,
|
|
"duration": 1737,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:56:45.465Z",
|
|
"passed": true,
|
|
"duration": 626,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T21:19:15.716Z",
|
|
"passed": true,
|
|
"duration": 2024,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T21:19:16.361Z",
|
|
"passed": true,
|
|
"duration": 641,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T21:19:20.162Z",
|
|
"passed": true,
|
|
"duration": 3798,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T21:19:21.917Z",
|
|
"passed": true,
|
|
"duration": 1752,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T21:19:22.504Z",
|
|
"passed": true,
|
|
"duration": 585,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T21:19:25.779Z",
|
|
"passed": true,
|
|
"duration": 3272,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T21:19:27.557Z",
|
|
"passed": true,
|
|
"duration": 1775,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T21:19:28.041Z",
|
|
"passed": true,
|
|
"duration": 481,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T21:19:31.450Z",
|
|
"passed": true,
|
|
"duration": 3406,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [
|
|
"yes"
|
|
],
|
|
"expected": "yes",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T21:19:37.473Z",
|
|
"passed": true,
|
|
"duration": 6020,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [
|
|
"yes"
|
|
],
|
|
"expected": "yes",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T21:19:42.394Z",
|
|
"passed": true,
|
|
"duration": 4917,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [],
|
|
"expected": "yes",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T21:19:47.544Z",
|
|
"passed": false,
|
|
"duration": 5147,
|
|
"reason": "Model returned empty response",
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T22:54:48.966Z",
|
|
"passed": true,
|
|
"duration": 1522,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T22:54:49.606Z",
|
|
"passed": true,
|
|
"duration": 634,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "addition",
|
|
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T22:54:53.004Z",
|
|
"passed": true,
|
|
"duration": 3394,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T22:54:53.710Z",
|
|
"passed": true,
|
|
"duration": 702,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"24"
|
|
],
|
|
"expected": "24",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T22:54:56.480Z",
|
|
"passed": true,
|
|
"duration": 2765,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"The result of multiplying 8 and 3 is \\boxed{24}."
|
|
],
|
|
"expected": "24",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T22:54:59.909Z",
|
|
"passed": false,
|
|
"duration": 3425,
|
|
"reason": "Expected 24, but got The result of multiplying 8 and 3 is \\boxed{24}.",
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T22:55:01.169Z",
|
|
"passed": true,
|
|
"duration": 1252,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T22:55:01.737Z",
|
|
"passed": true,
|
|
"duration": 564,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "division",
|
|
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "5",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T22:55:06.362Z",
|
|
"passed": true,
|
|
"duration": 4619,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [
|
|
"Looking through the table of contents in the Wikipedia article on Kenya, I can see that there is indeed a section titled \"Prehistory\" under the History section.\n\nyes"
|
|
],
|
|
"expected": "yes",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2025-06-05T22:55:12.528Z",
|
|
"passed": false,
|
|
"duration": 6161,
|
|
"reason": "Expected yes, but got Looking through the table of contents in the Wikipedia article on Kenya, I can see that there is indeed a section titled \"Prehistory\" under the History section.\n\nyes",
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [
|
|
"yes"
|
|
],
|
|
"expected": "yes",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T22:55:18.757Z",
|
|
"passed": true,
|
|
"duration": 6225,
|
|
"category": "basic"
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"prompt": "Check if the content contains a section about Human prehistory. Reply with \"yes\" if it does, \"no\" if it does not.",
|
|
"result": [],
|
|
"expected": "yes",
|
|
"model": "deepseek/deepseek-r1:free",
|
|
"router": "deepseek/deepseek-r1:free",
|
|
"timestamp": "2025-06-05T22:55:25.642Z",
|
|
"passed": false,
|
|
"duration": 6879,
|
|
"reason": "Model returned empty response",
|
|
"category": "basic"
|
|
}
|
|
],
|
|
"highscores": [
|
|
{
|
|
"test": "addition",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 634,
|
|
"duration_secs": 0.634
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 771,
|
|
"duration_secs": 0.771
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "multiplication",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 624,
|
|
"duration_secs": 0.624
|
|
},
|
|
{
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"duration": 702,
|
|
"duration_secs": 0.702
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "division",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 513,
|
|
"duration_secs": 0.513
|
|
},
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 564,
|
|
"duration_secs": 0.564
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "web_content",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 220,
|
|
"duration_secs": 0.22
|
|
},
|
|
{
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"duration": 6161,
|
|
"duration_secs": 6.161
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"lastUpdated": "2025-06-05T22:55:25.642Z"
|
|
} |