2265 lines
68 KiB
JSON
2265 lines
68 KiB
JSON
{
|
|
"results": [
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:09:36.865Z",
|
|
"passed": false,
|
|
"duration": 1944,
|
|
"reason": "Expected -3,-2, but got -2,-3"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-3,-2"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:09:47.481Z",
|
|
"passed": true,
|
|
"duration": 10608
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:09:49.153Z",
|
|
"passed": true,
|
|
"duration": 1671
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [],
|
|
"expected": "120",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:10:03.043Z",
|
|
"passed": false,
|
|
"duration": 13889,
|
|
"reason": "Model returned empty response"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:10:03.988Z",
|
|
"passed": true,
|
|
"duration": 943
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:10:05.723Z",
|
|
"passed": false,
|
|
"duration": 1734,
|
|
"reason": "Expected 8, but got 5"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:10:07.465Z",
|
|
"passed": true,
|
|
"duration": 1739
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:10:13.671Z",
|
|
"passed": true,
|
|
"duration": 6205
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:10:14.967Z",
|
|
"passed": true,
|
|
"duration": 1295
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:10:20.932Z",
|
|
"passed": true,
|
|
"duration": 5964
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:13:10.276Z",
|
|
"passed": false,
|
|
"duration": 1242,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:13:31.650Z",
|
|
"passed": false,
|
|
"duration": 21368,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:13:34.699Z",
|
|
"passed": true,
|
|
"duration": 3046,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:13:45.957Z",
|
|
"passed": true,
|
|
"duration": 11256,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:13:47.935Z",
|
|
"passed": true,
|
|
"duration": 1976,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:14:07.714Z",
|
|
"passed": false,
|
|
"duration": 19778,
|
|
"reason": "Expected 8, but got 5",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:14:08.883Z",
|
|
"passed": true,
|
|
"duration": 1167,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:14:12.225Z",
|
|
"passed": true,
|
|
"duration": 3341,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:14:12.889Z",
|
|
"passed": true,
|
|
"duration": 663,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:14:32.527Z",
|
|
"passed": true,
|
|
"duration": 19636,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:10.419Z",
|
|
"passed": false,
|
|
"duration": 1650,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-3,-2"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:20.647Z",
|
|
"passed": true,
|
|
"duration": 10222,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:21.643Z",
|
|
"passed": false,
|
|
"duration": 994,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:22.524Z",
|
|
"passed": false,
|
|
"duration": 878,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:23.496Z",
|
|
"passed": true,
|
|
"duration": 970,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:28.452Z",
|
|
"passed": true,
|
|
"duration": 4954,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:29.325Z",
|
|
"passed": true,
|
|
"duration": 872,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:30.109Z",
|
|
"passed": true,
|
|
"duration": 782,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:33.902Z",
|
|
"passed": true,
|
|
"duration": 3791,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:46.225Z",
|
|
"passed": false,
|
|
"duration": 12322,
|
|
"reason": "Expected 8, but got 5",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:47.202Z",
|
|
"passed": false,
|
|
"duration": 974,
|
|
"reason": "Expected 8, but got 5",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:48.005Z",
|
|
"passed": true,
|
|
"duration": 800,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:48.763Z",
|
|
"passed": true,
|
|
"duration": 756,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:55.510Z",
|
|
"passed": true,
|
|
"duration": 6745,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:56.297Z",
|
|
"passed": true,
|
|
"duration": 785,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:57.051Z",
|
|
"passed": true,
|
|
"duration": 751,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:22:58.294Z",
|
|
"passed": true,
|
|
"duration": 1241,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:23:04.551Z",
|
|
"passed": false,
|
|
"duration": 6255,
|
|
"reason": "Model returned empty response",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:23:05.297Z",
|
|
"passed": true,
|
|
"duration": 743,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:23:06.018Z",
|
|
"passed": true,
|
|
"duration": 719,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:24:32.237Z",
|
|
"passed": false,
|
|
"duration": 1533,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-3, -2"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:24:50.178Z",
|
|
"passed": false,
|
|
"duration": 17934,
|
|
"reason": "Expected -3,-2, but got -3, -2",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:24:51.040Z",
|
|
"passed": false,
|
|
"duration": 859,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openrouter",
|
|
"timestamp": "2025-04-02T11:24:51.938Z",
|
|
"passed": false,
|
|
"duration": 895,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:22:28.224Z",
|
|
"passed": false,
|
|
"duration": 1311,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [],
|
|
"expected": "-3,-2",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:22:58.238Z",
|
|
"passed": false,
|
|
"duration": 30008,
|
|
"error": {
|
|
"message": "API call timed out",
|
|
"code": "UNKNOWN",
|
|
"type": "Error",
|
|
"details": {
|
|
"stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:137:33)\n at listOnTimeout (node:internal/timers:594:17)\n at processTimers (node:internal/timers:529:7)",
|
|
"message": "API call timed out"
|
|
}
|
|
},
|
|
"reason": "API call timed out",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:22:59.263Z",
|
|
"passed": false,
|
|
"duration": 1022,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-3, -2"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:23:00.561Z",
|
|
"passed": false,
|
|
"duration": 1294,
|
|
"reason": "Expected -3,-2, but got -3, -2",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:23:17.772Z",
|
|
"passed": false,
|
|
"duration": 17208,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:23:28.819Z",
|
|
"passed": false,
|
|
"duration": 11043,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:23:30.914Z",
|
|
"passed": true,
|
|
"duration": 2093,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:23:36.265Z",
|
|
"passed": true,
|
|
"duration": 5349,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:23:37.084Z",
|
|
"passed": true,
|
|
"duration": 816,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:23:38.020Z",
|
|
"passed": true,
|
|
"duration": 934,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:23:49.147Z",
|
|
"passed": true,
|
|
"duration": 11125,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:23:52.904Z",
|
|
"passed": true,
|
|
"duration": 3755,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:23:54.223Z",
|
|
"passed": true,
|
|
"duration": 1316,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:24:24.234Z",
|
|
"passed": false,
|
|
"duration": 30009,
|
|
"error": {
|
|
"message": "API call timed out",
|
|
"code": "UNKNOWN",
|
|
"type": "Error",
|
|
"details": {
|
|
"stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:137:33)\n at listOnTimeout (node:internal/timers:594:17)\n at processTimers (node:internal/timers:529:7)",
|
|
"message": "API call timed out"
|
|
}
|
|
},
|
|
"reason": "API call timed out",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:24:25.494Z",
|
|
"passed": false,
|
|
"duration": 1257,
|
|
"reason": "Expected 8, but got 5",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:24:26.272Z",
|
|
"passed": true,
|
|
"duration": 776,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:24:56.284Z",
|
|
"passed": false,
|
|
"duration": 30009,
|
|
"error": {
|
|
"message": "API call timed out",
|
|
"code": "UNKNOWN",
|
|
"type": "Error",
|
|
"details": {
|
|
"stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:137:33)\n at listOnTimeout (node:internal/timers:594:17)\n at processTimers (node:internal/timers:529:7)",
|
|
"message": "API call timed out"
|
|
}
|
|
},
|
|
"reason": "API call timed out",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"The 6th number in the Fibonacci sequence is 5."
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:25:06.273Z",
|
|
"passed": false,
|
|
"duration": 9986,
|
|
"reason": "Expected 8, but got the 6th number in the fibonacci sequence is 5.",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:25:07.072Z",
|
|
"passed": true,
|
|
"duration": 795,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:25:12.207Z",
|
|
"passed": true,
|
|
"duration": 5133,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:25:13.308Z",
|
|
"passed": true,
|
|
"duration": 1099,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:25:14.724Z",
|
|
"passed": true,
|
|
"duration": 1414,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:25:24.633Z",
|
|
"passed": true,
|
|
"duration": 9907,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:25:28.285Z",
|
|
"passed": true,
|
|
"duration": 3650,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:25:31.067Z",
|
|
"passed": true,
|
|
"duration": 2779,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:25:35.904Z",
|
|
"passed": true,
|
|
"duration": 4834,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:25:36.993Z",
|
|
"passed": true,
|
|
"duration": 1087,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:25:37.796Z",
|
|
"passed": true,
|
|
"duration": 800,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:25:48.323Z",
|
|
"passed": true,
|
|
"duration": 10524,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"The result of 2 raised to the power of 3 is 8.\n\nAnswer: 8"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:26:02.730Z",
|
|
"passed": false,
|
|
"duration": 14405,
|
|
"reason": "Expected 8, but got the result of 2 raised to the power of 3 is 8.\n\nanswer: 8",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:30:35.457Z",
|
|
"passed": false,
|
|
"duration": 3064,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-3,-2"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:30:47.981Z",
|
|
"passed": true,
|
|
"duration": 12517,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:30:49.084Z",
|
|
"passed": false,
|
|
"duration": 1100,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:30:50.009Z",
|
|
"passed": false,
|
|
"duration": 921,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:31:24.046Z",
|
|
"passed": false,
|
|
"duration": 2341,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:32:31.578Z",
|
|
"passed": false,
|
|
"duration": 3137,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:32:51.448Z",
|
|
"passed": false,
|
|
"duration": 1892,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-3,-2"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:32:58.250Z",
|
|
"passed": true,
|
|
"duration": 6795,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:32:59.105Z",
|
|
"passed": false,
|
|
"duration": 853,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:32:59.942Z",
|
|
"passed": false,
|
|
"duration": 832,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:33:19.796Z",
|
|
"passed": false,
|
|
"duration": 19850,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"The solutions to the quadratic equation x² + 5x + 6 = 0 are x = -2 and x = -3.\n\n-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:33:35.611Z",
|
|
"passed": false,
|
|
"duration": 15811,
|
|
"reason": "Expected -3,-2, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are x = -2 and x = -3.\n\n-2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:33:37.469Z",
|
|
"passed": true,
|
|
"duration": 1853,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:33:44.364Z",
|
|
"passed": true,
|
|
"duration": 6892,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:33:45.323Z",
|
|
"passed": true,
|
|
"duration": 956,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:33:46.153Z",
|
|
"passed": true,
|
|
"duration": 827,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:33:57.349Z",
|
|
"passed": true,
|
|
"duration": 11193,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:34:02.166Z",
|
|
"passed": true,
|
|
"duration": 4814,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:34:04.174Z",
|
|
"passed": true,
|
|
"duration": 2004,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:34:05.686Z",
|
|
"passed": false,
|
|
"duration": 1509,
|
|
"reason": "Expected 8, but got 5",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:34:07.363Z",
|
|
"passed": true,
|
|
"duration": 1673,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:34:08.909Z",
|
|
"passed": true,
|
|
"duration": 1543,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:34:38.921Z",
|
|
"passed": false,
|
|
"duration": 30009,
|
|
"error": {
|
|
"message": "API call timed out",
|
|
"code": "UNKNOWN",
|
|
"type": "Error",
|
|
"details": {
|
|
"stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:137:33)\n at listOnTimeout (node:internal/timers:594:17)\n at processTimers (node:internal/timers:529:7)",
|
|
"message": "API call timed out"
|
|
}
|
|
},
|
|
"reason": "API call timed out",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:34:44.095Z",
|
|
"passed": false,
|
|
"duration": 5171,
|
|
"reason": "Expected 8, but got 5",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:34:46.111Z",
|
|
"passed": true,
|
|
"duration": 2012,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:34:52.001Z",
|
|
"passed": true,
|
|
"duration": 5888,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:34:52.968Z",
|
|
"passed": true,
|
|
"duration": 964,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:34:54.051Z",
|
|
"passed": true,
|
|
"duration": 1080,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:35:04.364Z",
|
|
"passed": true,
|
|
"duration": 10309,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:35:10.480Z",
|
|
"passed": true,
|
|
"duration": 6114,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:35:11.619Z",
|
|
"passed": true,
|
|
"duration": 1136,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:35:19.194Z",
|
|
"passed": true,
|
|
"duration": 7572,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:35:20.455Z",
|
|
"passed": true,
|
|
"duration": 1259,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:35:21.956Z",
|
|
"passed": true,
|
|
"duration": 1498,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:35:27.372Z",
|
|
"passed": true,
|
|
"duration": 5414,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:35:37.266Z",
|
|
"passed": true,
|
|
"duration": 9891,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:36:22.644Z",
|
|
"passed": false,
|
|
"duration": 1358,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:36:31.987Z",
|
|
"passed": false,
|
|
"duration": 1375,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:37:00.757Z",
|
|
"passed": false,
|
|
"duration": 1589,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:37:38.502Z",
|
|
"passed": false,
|
|
"duration": 2344,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:37:51.538Z",
|
|
"passed": false,
|
|
"duration": 2010,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:37:59.511Z",
|
|
"passed": false,
|
|
"duration": 1399,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:39:33.082Z",
|
|
"passed": false,
|
|
"duration": 1792,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:40:21.065Z",
|
|
"passed": false,
|
|
"duration": 1221,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:40:47.110Z",
|
|
"passed": false,
|
|
"duration": 2367,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-3, -2"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:40:56.377Z",
|
|
"passed": false,
|
|
"duration": 9259,
|
|
"reason": "Expected -3,-2, but got -3, -2",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:40:57.391Z",
|
|
"passed": false,
|
|
"duration": 1010,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:40:58.209Z",
|
|
"passed": false,
|
|
"duration": 815,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:41:02.148Z",
|
|
"passed": false,
|
|
"duration": 3934,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"The solutions to the equation x² + 5x + 6 = 0 are -2 and -3.\n\n-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:41:10.265Z",
|
|
"passed": false,
|
|
"duration": 8112,
|
|
"reason": "Expected -3,-2, but got the solutions to the equation x² + 5x + 6 = 0 are -2 and -3.\n\n-2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:41:11.723Z",
|
|
"passed": true,
|
|
"duration": 1454,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:41:21.969Z",
|
|
"passed": true,
|
|
"duration": 10242,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:41:22.848Z",
|
|
"passed": true,
|
|
"duration": 876,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:41:23.636Z",
|
|
"passed": true,
|
|
"duration": 785,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:41:30.847Z",
|
|
"passed": true,
|
|
"duration": 7208,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Return only the number, no explanation.",
|
|
"result": [
|
|
"5! is equal to 120. \n\nAnswer: 120"
|
|
],
|
|
"expected": "120",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:41:34.615Z",
|
|
"passed": false,
|
|
"duration": 3765,
|
|
"reason": "Expected 120, but got 5! is equal to 120. \n\nanswer: 120",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:41:36.384Z",
|
|
"passed": true,
|
|
"duration": 1765,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:41:56.369Z",
|
|
"passed": false,
|
|
"duration": 19981,
|
|
"reason": "Expected 8, but got 5",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:41:57.522Z",
|
|
"passed": true,
|
|
"duration": 1149,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:41:58.423Z",
|
|
"passed": true,
|
|
"duration": 898,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:42:28.442Z",
|
|
"passed": false,
|
|
"duration": 30014,
|
|
"error": {
|
|
"message": "API call timed out",
|
|
"code": "UNKNOWN",
|
|
"type": "Error",
|
|
"details": {
|
|
"stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:138:33)\n at listOnTimeout (node:internal/timers:594:17)\n at processTimers (node:internal/timers:529:7)",
|
|
"message": "API call timed out"
|
|
}
|
|
},
|
|
"reason": "API call timed out",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.",
|
|
"result": [
|
|
"5"
|
|
],
|
|
"expected": "8",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:42:33.805Z",
|
|
"passed": false,
|
|
"duration": 5358,
|
|
"reason": "Expected 8, but got 5",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:42:34.628Z",
|
|
"passed": true,
|
|
"duration": 819,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:42:35.522Z",
|
|
"passed": true,
|
|
"duration": 890,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:42:36.791Z",
|
|
"passed": true,
|
|
"duration": 1266,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:42:37.938Z",
|
|
"passed": true,
|
|
"duration": 1144,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "deepseek/deepseek-r1",
|
|
"router": "deepseek/deepseek-r1",
|
|
"timestamp": "2025-04-02T13:42:42.967Z",
|
|
"passed": true,
|
|
"duration": 5026,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Return only the number, no explanation.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-02T13:42:51.135Z",
|
|
"passed": true,
|
|
"duration": 8165,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:42:52.650Z",
|
|
"passed": true,
|
|
"duration": 1512,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:42:57.896Z",
|
|
"passed": true,
|
|
"duration": 5243,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:42:58.774Z",
|
|
"passed": true,
|
|
"duration": 875,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Return only the number, no explanation.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:42:59.643Z",
|
|
"passed": true,
|
|
"duration": 866,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"router": "anthropic/claude-3.5-sonnet",
|
|
"timestamp": "2025-04-02T13:43:13.962Z",
|
|
"passed": false,
|
|
"duration": 1435,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "qwen/qwq-32b",
|
|
"router": "qwen/qwq-32b",
|
|
"timestamp": "2025-04-02T13:43:39.174Z",
|
|
"passed": false,
|
|
"duration": 25202,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-2, -3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-02T13:43:40.474Z",
|
|
"passed": false,
|
|
"duration": 1295,
|
|
"reason": "Expected -3,-2, but got -2, -3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.",
|
|
"result": [
|
|
"-3, -2"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-02T13:43:41.957Z",
|
|
"passed": false,
|
|
"duration": 1478,
|
|
"reason": "Expected -3,-2, but got -3, -2",
|
|
"category": "math"
|
|
}
|
|
],
|
|
"highscores": [
|
|
{
|
|
"test": "quadratic",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 1295,
|
|
"duration_secs": 1.295
|
|
},
|
|
{
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"duration": 1435,
|
|
"duration_secs": 1.435
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 785,
|
|
"duration_secs": 0.785
|
|
},
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 876,
|
|
"duration_secs": 0.876
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 898,
|
|
"duration_secs": 0.898
|
|
},
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 1149,
|
|
"duration_secs": 1.149
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"rankings": [
|
|
{
|
|
"model": "anthropic/claude-3.5-sonnet",
|
|
"duration": 819,
|
|
"duration_secs": 0.819
|
|
},
|
|
{
|
|
"model": "qwen/qwq-32b",
|
|
"duration": 890,
|
|
"duration_secs": 0.89
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "power",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 866,
|
|
"duration_secs": 0.866
|
|
},
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 875,
|
|
"duration_secs": 0.875
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"lastUpdated": "2025-04-02T13:43:41.958Z"
|
|
} |