mono/packages/kbot/tests/unit/reports/math.json
2025-04-02 13:27:47 +02:00

700 lines
19 KiB
JSON

{
"results": [
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-3,-2",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:09:36.865Z",
"passed": false,
"duration": 1944,
"reason": "Expected -3,-2, but got -2,-3"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-3,-2"
],
"expected": "-3,-2",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:09:47.481Z",
"passed": true,
"duration": 10608
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:09:49.153Z",
"passed": true,
"duration": 1671
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [],
"expected": "120",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:03.043Z",
"passed": false,
"duration": 13889,
"reason": "Model returned empty response"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:03.988Z",
"passed": true,
"duration": 943
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:05.723Z",
"passed": false,
"duration": 1734,
"reason": "Expected 8, but got 5"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:07.465Z",
"passed": true,
"duration": 1739
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:13.671Z",
"passed": true,
"duration": 6205
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:14.967Z",
"passed": true,
"duration": 1295
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:10:20.932Z",
"passed": true,
"duration": 5964
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-3,-2",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:13:10.276Z",
"passed": false,
"duration": 1242,
"reason": "Expected -3,-2, but got -2,-3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-3,-2",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:13:31.650Z",
"passed": false,
"duration": 21368,
"reason": "Expected -3,-2, but got -2, -3",
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:13:34.699Z",
"passed": true,
"duration": 3046,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:13:45.957Z",
"passed": true,
"duration": 11256,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:13:47.935Z",
"passed": true,
"duration": 1976,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:14:07.714Z",
"passed": false,
"duration": 19778,
"reason": "Expected 8, but got 5",
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:14:08.883Z",
"passed": true,
"duration": 1167,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:14:12.225Z",
"passed": true,
"duration": 3341,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:14:12.889Z",
"passed": true,
"duration": 663,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:14:32.527Z",
"passed": true,
"duration": 19636,
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-3,-2",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:10.419Z",
"passed": false,
"duration": 1650,
"reason": "Expected -3,-2, but got -2,-3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-3,-2"
],
"expected": "-3,-2",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:20.647Z",
"passed": true,
"duration": 10222,
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-3,-2",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:21.643Z",
"passed": false,
"duration": 994,
"reason": "Expected -3,-2, but got -2, -3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-3,-2",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:22.524Z",
"passed": false,
"duration": 878,
"reason": "Expected -3,-2, but got -2, -3",
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:23.496Z",
"passed": true,
"duration": 970,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:28.452Z",
"passed": true,
"duration": 4954,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:29.325Z",
"passed": true,
"duration": 872,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:30.109Z",
"passed": true,
"duration": 782,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:33.902Z",
"passed": true,
"duration": 3791,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:46.225Z",
"passed": false,
"duration": 12322,
"reason": "Expected 8, but got 5",
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:47.202Z",
"passed": false,
"duration": 974,
"reason": "Expected 8, but got 5",
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:48.005Z",
"passed": true,
"duration": 800,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:48.763Z",
"passed": true,
"duration": 756,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:55.510Z",
"passed": true,
"duration": 6745,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:56.297Z",
"passed": true,
"duration": 785,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
"result": [
"4"
],
"expected": "4",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:57.051Z",
"passed": true,
"duration": 751,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:22:58.294Z",
"passed": true,
"duration": 1241,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:23:04.551Z",
"passed": false,
"duration": 6255,
"reason": "Model returned empty response",
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:23:05.297Z",
"passed": true,
"duration": 743,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:23:06.018Z",
"passed": true,
"duration": 719,
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2,-3"
],
"expected": "-3,-2",
"model": "anthropic/claude-3.5-sonnet",
"router": "openrouter",
"timestamp": "2025-04-02T11:24:32.237Z",
"passed": false,
"duration": 1533,
"reason": "Expected -3,-2, but got -2,-3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-3, -2"
],
"expected": "-3,-2",
"model": "qwen/qwq-32b",
"router": "openrouter",
"timestamp": "2025-04-02T11:24:50.178Z",
"passed": false,
"duration": 17934,
"reason": "Expected -3,-2, but got -3, -2",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-3,-2",
"model": "openai/gpt-4o-mini",
"router": "openrouter",
"timestamp": "2025-04-02T11:24:51.040Z",
"passed": false,
"duration": 859,
"reason": "Expected -3,-2, but got -2, -3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
"result": [
"-2, -3"
],
"expected": "-3,-2",
"model": "openai/gpt-3.5-turbo",
"router": "openrouter",
"timestamp": "2025-04-02T11:24:51.938Z",
"passed": false,
"duration": 895,
"reason": "Expected -3,-2, but got -2, -3",
"category": "math"
}
],
"highscores": [
{
"test": "quadratic",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 859,
"duration_secs": 0.859
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 895,
"duration_secs": 0.895
}
]
},
{
"test": "factorial",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 782,
"duration_secs": 0.782
},
{
"model": "openai/gpt-4o-mini",
"duration": 872,
"duration_secs": 0.872
}
]
},
{
"test": "fibonacci",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 800,
"duration_secs": 0.8
},
{
"model": "openai/gpt-4o-mini",
"duration": 974,
"duration_secs": 0.974
}
]
},
{
"test": "square_root",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 751,
"duration_secs": 0.751
},
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 756,
"duration_secs": 0.756
}
]
},
{
"test": "power",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 719,
"duration_secs": 0.719
},
{
"model": "openai/gpt-4o-mini",
"duration": 743,
"duration_secs": 0.743
}
]
}
],
"lastUpdated": "2025-04-02T11:24:51.939Z"
}