436 lines
12 KiB
JSON
436 lines
12 KiB
JSON
{
|
|
"results": [
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Respond ONLY with the solutions as comma-separated numbers (e.g., -3,-2). No other text.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:07.920Z",
|
|
"passed": false,
|
|
"duration": 1202,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Respond ONLY with the solutions as comma-separated numbers (e.g., -3,-2). No other text.",
|
|
"result": [
|
|
"-3,-2"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:09.012Z",
|
|
"passed": true,
|
|
"duration": 1088,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Respond ONLY with the final numerical answer. No explanation, no other text.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:09.517Z",
|
|
"passed": true,
|
|
"duration": 503,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Respond ONLY with the final numerical answer. No explanation, no other text.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:10.000Z",
|
|
"passed": true,
|
|
"duration": 481,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence (assuming F(1)=1, F(2)=1). Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:10.505Z",
|
|
"passed": true,
|
|
"duration": 503,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence (assuming F(1)=1, F(2)=1). Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:11.109Z",
|
|
"passed": true,
|
|
"duration": 601,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:11.849Z",
|
|
"passed": true,
|
|
"duration": 738,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:12.390Z",
|
|
"passed": true,
|
|
"duration": 539,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:12.985Z",
|
|
"passed": true,
|
|
"duration": 592,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:14.091Z",
|
|
"passed": true,
|
|
"duration": 1103,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Respond ONLY with the solutions as comma-separated numbers (e.g., -3,-2). No other text.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2026-03-19T15:38:59.106Z",
|
|
"passed": false,
|
|
"duration": 1401,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Respond ONLY with the solutions as comma-separated numbers (e.g., -3,-2). No other text.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2026-03-19T15:38:59.626Z",
|
|
"passed": false,
|
|
"duration": 514,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Respond ONLY with the solutions as comma-separated numbers (e.g., -3,-2). No other text.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2026-03-19T15:39:00.750Z",
|
|
"passed": false,
|
|
"duration": 1120,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Respond ONLY with the final numerical answer. No explanation, no other text.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2026-03-19T15:39:01.739Z",
|
|
"passed": true,
|
|
"duration": 985,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Respond ONLY with the final numerical answer. No explanation, no other text.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2026-03-19T15:39:02.255Z",
|
|
"passed": true,
|
|
"duration": 512,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Respond ONLY with the final numerical answer. No explanation, no other text.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2026-03-19T15:39:03.136Z",
|
|
"passed": true,
|
|
"duration": 877,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence (assuming F(1)=1, F(2)=1). Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2026-03-19T15:39:04.243Z",
|
|
"passed": true,
|
|
"duration": 1103,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence (assuming F(1)=1, F(2)=1). Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2026-03-19T15:39:04.740Z",
|
|
"passed": true,
|
|
"duration": 494,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence (assuming F(1)=1, F(2)=1). Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2026-03-19T15:39:08.836Z",
|
|
"passed": true,
|
|
"duration": 4093,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2026-03-19T15:39:10.351Z",
|
|
"passed": true,
|
|
"duration": 1512,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2026-03-19T15:39:10.838Z",
|
|
"passed": true,
|
|
"duration": 483,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2026-03-19T15:39:11.810Z",
|
|
"passed": true,
|
|
"duration": 969,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2026-03-19T15:39:13.154Z",
|
|
"passed": true,
|
|
"duration": 1340,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2026-03-19T15:39:14.466Z",
|
|
"passed": true,
|
|
"duration": 1308,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"router": "anthropic/claude-sonnet-4",
|
|
"timestamp": "2026-03-19T15:39:15.600Z",
|
|
"passed": true,
|
|
"duration": 1129,
|
|
"category": "math"
|
|
}
|
|
],
|
|
"highscores": [
|
|
{
|
|
"test": "quadratic",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 514,
|
|
"duration_secs": 0.514
|
|
},
|
|
{
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"duration": 1120,
|
|
"duration_secs": 1.12
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 503,
|
|
"duration_secs": 0.503
|
|
},
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 512,
|
|
"duration_secs": 0.512
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 494,
|
|
"duration_secs": 0.494
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 503,
|
|
"duration_secs": 0.503
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 483,
|
|
"duration_secs": 0.483
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 738,
|
|
"duration_secs": 0.738
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "power",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 592,
|
|
"duration_secs": 0.592
|
|
},
|
|
{
|
|
"model": "anthropic/claude-sonnet-4",
|
|
"duration": 1129,
|
|
"duration_secs": 1.129
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"lastUpdated": "2026-03-19T15:39:15.600Z"
|
|
} |