1304 lines
34 KiB
JSON
1304 lines
34 KiB
JSON
{
|
||
"results": [
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T10:56:09.502Z",
|
||
"passed": true,
|
||
"duration": 1237
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T10:56:13.802Z",
|
||
"passed": true,
|
||
"duration": 4298
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T10:56:15.214Z",
|
||
"passed": true,
|
||
"duration": 1411
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T10:56:18.337Z",
|
||
"passed": true,
|
||
"duration": 3122
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T10:56:18.922Z",
|
||
"passed": true,
|
||
"duration": 583
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T10:56:22.539Z",
|
||
"passed": true,
|
||
"duration": 3615
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:01:08.904Z",
|
||
"passed": true,
|
||
"duration": 1888
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:01:15.210Z",
|
||
"passed": true,
|
||
"duration": 6304
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:01:16.502Z",
|
||
"passed": true,
|
||
"duration": 1291
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:01:18.728Z",
|
||
"passed": true,
|
||
"duration": 2225
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:01:19.938Z",
|
||
"passed": true,
|
||
"duration": 1209
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:01:27.791Z",
|
||
"passed": true,
|
||
"duration": 7852
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:21.370Z",
|
||
"passed": true,
|
||
"duration": 1213,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:24.898Z",
|
||
"passed": true,
|
||
"duration": 3524,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:25.624Z",
|
||
"passed": true,
|
||
"duration": 724,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:26.630Z",
|
||
"passed": true,
|
||
"duration": 1005,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:27.812Z",
|
||
"passed": true,
|
||
"duration": 1178,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:31.317Z",
|
||
"passed": true,
|
||
"duration": 3503,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:32.288Z",
|
||
"passed": true,
|
||
"duration": 969,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:33.147Z",
|
||
"passed": true,
|
||
"duration": 858,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:33.724Z",
|
||
"passed": true,
|
||
"duration": 576,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:34.841Z",
|
||
"passed": true,
|
||
"duration": 1115,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:35.673Z",
|
||
"passed": true,
|
||
"duration": 831,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:16:36.762Z",
|
||
"passed": true,
|
||
"duration": 1087,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:25.749Z",
|
||
"passed": true,
|
||
"duration": 1644,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:31.261Z",
|
||
"passed": true,
|
||
"duration": 5507,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:32.131Z",
|
||
"passed": true,
|
||
"duration": 869,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:33.306Z",
|
||
"passed": true,
|
||
"duration": 1173,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:34.323Z",
|
||
"passed": true,
|
||
"duration": 1016,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:38.976Z",
|
||
"passed": true,
|
||
"duration": 4651,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:39.914Z",
|
||
"passed": true,
|
||
"duration": 937,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:41.053Z",
|
||
"passed": true,
|
||
"duration": 1137,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:42.918Z",
|
||
"passed": true,
|
||
"duration": 1863,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:47.234Z",
|
||
"passed": true,
|
||
"duration": 4314,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:47.966Z",
|
||
"passed": true,
|
||
"duration": 730,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:20:48.941Z",
|
||
"passed": true,
|
||
"duration": 973,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:25:15.745Z",
|
||
"passed": true,
|
||
"duration": 1951,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:25:19.476Z",
|
||
"passed": true,
|
||
"duration": 3726,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:25:20.854Z",
|
||
"passed": true,
|
||
"duration": 1376,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:25:28.044Z",
|
||
"passed": true,
|
||
"duration": 7188,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "deepseek/deepseek-r1",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:25:43.203Z",
|
||
"passed": true,
|
||
"duration": 15157,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:25:50.736Z",
|
||
"passed": true,
|
||
"duration": 7531,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:25:51.834Z",
|
||
"passed": true,
|
||
"duration": 1096,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:25:55.428Z",
|
||
"passed": true,
|
||
"duration": 3592,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:25:56.874Z",
|
||
"passed": true,
|
||
"duration": 1444,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:25:57.746Z",
|
||
"passed": true,
|
||
"duration": 870,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "deepseek/deepseek-r1",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:26:08.731Z",
|
||
"passed": true,
|
||
"duration": 10983,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:26:14.379Z",
|
||
"passed": true,
|
||
"duration": 5646,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:26:15.658Z",
|
||
"passed": true,
|
||
"duration": 1276,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:26:21.428Z",
|
||
"passed": true,
|
||
"duration": 5768,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:26:22.358Z",
|
||
"passed": true,
|
||
"duration": 929,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:26:23.155Z",
|
||
"passed": true,
|
||
"duration": 794,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "deepseek/deepseek-r1",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:26:38.566Z",
|
||
"passed": true,
|
||
"duration": 15409,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "openrouter",
|
||
"timestamp": "2025-04-02T11:26:40.358Z",
|
||
"passed": true,
|
||
"duration": 1790,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "anthropic/claude-3.5-sonnet",
|
||
"timestamp": "2025-04-02T13:44:06.429Z",
|
||
"passed": true,
|
||
"duration": 1689,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "qwen/qwq-32b",
|
||
"timestamp": "2025-04-02T13:44:10.240Z",
|
||
"passed": true,
|
||
"duration": 3807,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openai/gpt-4o-mini",
|
||
"timestamp": "2025-04-02T13:44:11.128Z",
|
||
"passed": true,
|
||
"duration": 885,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openai/gpt-3.5-turbo",
|
||
"timestamp": "2025-04-02T13:44:21.587Z",
|
||
"passed": true,
|
||
"duration": 10455,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "deepseek/deepseek-r1",
|
||
"router": "deepseek/deepseek-r1",
|
||
"timestamp": "2025-04-02T13:44:33.654Z",
|
||
"passed": true,
|
||
"duration": 12064,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"The sum of 5 and 3 is 8. Therefore, the result is \\boxed{8}."
|
||
],
|
||
"expected": "8",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"timestamp": "2025-04-02T13:44:40.062Z",
|
||
"passed": false,
|
||
"duration": 6405,
|
||
"reason": "Expected 8, but got the sum of 5 and 3 is 8. therefore, the result is \\boxed{8}.",
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "anthropic/claude-3.5-sonnet",
|
||
"timestamp": "2025-04-02T13:44:41.261Z",
|
||
"passed": true,
|
||
"duration": 1190,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "qwen/qwq-32b",
|
||
"timestamp": "2025-04-02T13:44:46.272Z",
|
||
"passed": true,
|
||
"duration": 5008,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openai/gpt-4o-mini",
|
||
"timestamp": "2025-04-02T13:44:47.386Z",
|
||
"passed": true,
|
||
"duration": 1111,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openai/gpt-3.5-turbo",
|
||
"timestamp": "2025-04-02T13:44:48.372Z",
|
||
"passed": true,
|
||
"duration": 984,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24\n\n24\n\nThe result is 24.\n\n24\n\nHere's the answer: 24\n\nThe answer will be 24.\n\n24\n\n24\n\nThe product of 8 and 3 is 24.\n\n24\n\nThe answer is 24.\n\n24\n\n24\n\n24\n\nThe result is 24.\n\n24\n\nHere's the numerical result: 24\n\nThe answer is 24.\n\n24\n\n24\n\nThe answer is 24."
|
||
],
|
||
"expected": "24",
|
||
"model": "deepseek/deepseek-r1",
|
||
"router": "deepseek/deepseek-r1",
|
||
"timestamp": "2025-04-02T13:44:53.633Z",
|
||
"passed": false,
|
||
"duration": 5258,
|
||
"reason": "Expected 24, but got 24\n\n24\n\nthe result is 24.\n\n24\n\nhere's the answer: 24\n\nthe answer will be 24.\n\n24\n\n24\n\nthe product of 8 and 3 is 24.\n\n24\n\nthe answer is 24.\n\n24\n\n24\n\n24\n\nthe result is 24.\n\n24\n\nhere's the numerical result: 24\n\nthe answer is 24.\n\n24\n\n24\n\nthe answer is 24.",
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"timestamp": "2025-04-02T13:44:55.196Z",
|
||
"passed": true,
|
||
"duration": 1558,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "anthropic/claude-3.5-sonnet",
|
||
"router": "anthropic/claude-3.5-sonnet",
|
||
"timestamp": "2025-04-02T13:44:56.604Z",
|
||
"passed": true,
|
||
"duration": 1405,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "qwen/qwq-32b",
|
||
"router": "qwen/qwq-32b",
|
||
"timestamp": "2025-04-02T13:44:57.523Z",
|
||
"passed": true,
|
||
"duration": 917,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openai/gpt-4o-mini",
|
||
"timestamp": "2025-04-02T13:44:58.630Z",
|
||
"passed": true,
|
||
"duration": 1104,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openai/gpt-3.5-turbo",
|
||
"timestamp": "2025-04-02T13:44:59.523Z",
|
||
"passed": true,
|
||
"duration": 889,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "deepseek/deepseek-r1",
|
||
"router": "deepseek/deepseek-r1",
|
||
"timestamp": "2025-04-02T13:45:06.658Z",
|
||
"passed": true,
|
||
"duration": 7130,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"timestamp": "2025-04-02T13:45:10.307Z",
|
||
"passed": true,
|
||
"duration": 3646,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openai/gpt-3.5-turbo",
|
||
"timestamp": "2025-04-02T22:06:38.904Z",
|
||
"passed": true,
|
||
"duration": 2263,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"timestamp": "2025-04-02T22:06:41.138Z",
|
||
"passed": true,
|
||
"duration": 2228,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openai/gpt-4o-mini",
|
||
"timestamp": "2025-04-02T22:06:41.934Z",
|
||
"passed": true,
|
||
"duration": 791,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openai/gpt-3.5-turbo",
|
||
"timestamp": "2025-04-02T22:06:43.239Z",
|
||
"passed": true,
|
||
"duration": 1300,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8 × 3 = 24"
|
||
],
|
||
"expected": "24",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"timestamp": "2025-04-02T22:06:46.025Z",
|
||
"passed": false,
|
||
"duration": 2782,
|
||
"reason": "Expected 24, but got 8 × 3 = 24",
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openai/gpt-4o-mini",
|
||
"timestamp": "2025-04-02T22:06:47.239Z",
|
||
"passed": true,
|
||
"duration": 1206,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openai/gpt-3.5-turbo",
|
||
"timestamp": "2025-04-02T22:06:48.026Z",
|
||
"passed": true,
|
||
"duration": 783,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"timestamp": "2025-04-02T22:06:51.012Z",
|
||
"passed": true,
|
||
"duration": 2982,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openai/gpt-4o-mini",
|
||
"timestamp": "2025-04-02T22:06:51.777Z",
|
||
"passed": true,
|
||
"duration": 760,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openai/gpt-3.5-turbo",
|
||
"timestamp": "2025-04-03T17:14:04.393Z",
|
||
"passed": true,
|
||
"duration": 1484,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"timestamp": "2025-04-03T17:14:12.861Z",
|
||
"passed": true,
|
||
"duration": 8460,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "addition",
|
||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"8"
|
||
],
|
||
"expected": "8",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openai/gpt-4o-mini",
|
||
"timestamp": "2025-04-03T17:14:13.779Z",
|
||
"passed": true,
|
||
"duration": 910,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openai/gpt-3.5-turbo",
|
||
"timestamp": "2025-04-03T17:14:14.740Z",
|
||
"passed": true,
|
||
"duration": 955,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"timestamp": "2025-04-03T17:14:22.399Z",
|
||
"passed": true,
|
||
"duration": 7653,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"24"
|
||
],
|
||
"expected": "24",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openai/gpt-4o-mini",
|
||
"timestamp": "2025-04-03T17:14:23.502Z",
|
||
"passed": true,
|
||
"duration": 1095,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"router": "openai/gpt-3.5-turbo",
|
||
"timestamp": "2025-04-03T17:14:24.325Z",
|
||
"passed": true,
|
||
"duration": 816,
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"15 divided by 3 is 5. \n\nAnswer: 5"
|
||
],
|
||
"expected": "5",
|
||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||
"timestamp": "2025-04-03T17:14:40.987Z",
|
||
"passed": false,
|
||
"duration": 16655,
|
||
"reason": "Expected 5, but got 15 divided by 3 is 5. \n\nanswer: 5",
|
||
"category": "basic"
|
||
},
|
||
{
|
||
"test": "division",
|
||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||
"result": [
|
||
"5"
|
||
],
|
||
"expected": "5",
|
||
"model": "openai/gpt-4o-mini",
|
||
"router": "openai/gpt-4o-mini",
|
||
"timestamp": "2025-04-03T17:14:41.951Z",
|
||
"passed": true,
|
||
"duration": 954,
|
||
"category": "basic"
|
||
}
|
||
],
|
||
"highscores": [
|
||
{
|
||
"test": "addition",
|
||
"rankings": [
|
||
{
|
||
"model": "openai/gpt-4o-mini",
|
||
"duration": 910,
|
||
"duration_secs": 0.91
|
||
},
|
||
{
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"duration": 1484,
|
||
"duration_secs": 1.484
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"test": "multiplication",
|
||
"rankings": [
|
||
{
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"duration": 955,
|
||
"duration_secs": 0.955
|
||
},
|
||
{
|
||
"model": "openai/gpt-4o-mini",
|
||
"duration": 1095,
|
||
"duration_secs": 1.095
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"test": "division",
|
||
"rankings": [
|
||
{
|
||
"model": "openai/gpt-3.5-turbo",
|
||
"duration": 816,
|
||
"duration_secs": 0.816
|
||
},
|
||
{
|
||
"model": "qwen/qwq-32b",
|
||
"duration": 917,
|
||
"duration_secs": 0.917
|
||
}
|
||
]
|
||
}
|
||
],
|
||
"lastUpdated": "2025-04-03T17:14:41.951Z"
|
||
} |