223 lines
5.9 KiB
JSON
223 lines
5.9 KiB
JSON
{
|
|
"results": [
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Respond ONLY with the solutions as comma-separated numbers (e.g., -3,-2). No other text.",
|
|
"result": [
|
|
"-2,-3"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:07.920Z",
|
|
"passed": false,
|
|
"duration": 1202,
|
|
"reason": "Expected -3,-2, but got -2,-3",
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "quadratic",
|
|
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Respond ONLY with the solutions as comma-separated numbers (e.g., -3,-2). No other text.",
|
|
"result": [
|
|
"-3,-2"
|
|
],
|
|
"expected": "-3,-2",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:09.012Z",
|
|
"passed": true,
|
|
"duration": 1088,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Respond ONLY with the final numerical answer. No explanation, no other text.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:09.517Z",
|
|
"passed": true,
|
|
"duration": 503,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"prompt": "Calculate 5! (factorial of 5). Respond ONLY with the final numerical answer. No explanation, no other text.",
|
|
"result": [
|
|
"120"
|
|
],
|
|
"expected": "120",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:10.000Z",
|
|
"passed": true,
|
|
"duration": 481,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence (assuming F(1)=1, F(2)=1). Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:10.505Z",
|
|
"passed": true,
|
|
"duration": 503,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"prompt": "Calculate the 6th number in the Fibonacci sequence (assuming F(1)=1, F(2)=1). Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:11.109Z",
|
|
"passed": true,
|
|
"duration": 601,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:11.849Z",
|
|
"passed": true,
|
|
"duration": 738,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"prompt": "Calculate the square root of 16. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"4"
|
|
],
|
|
"expected": "4",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:12.390Z",
|
|
"passed": true,
|
|
"duration": 539,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-06-05T18:46:12.985Z",
|
|
"passed": true,
|
|
"duration": 592,
|
|
"category": "math"
|
|
},
|
|
{
|
|
"test": "power",
|
|
"prompt": "Calculate 2 raised to the power of 3. Respond ONLY with the final numerical answer. No other text.",
|
|
"result": [
|
|
"8"
|
|
],
|
|
"expected": "8",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-06-05T18:46:14.091Z",
|
|
"passed": true,
|
|
"duration": 1103,
|
|
"category": "math"
|
|
}
|
|
],
|
|
"highscores": [
|
|
{
|
|
"test": "quadratic",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 1088,
|
|
"duration_secs": 1.088
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 1202,
|
|
"duration_secs": 1.202
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "factorial",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 481,
|
|
"duration_secs": 0.481
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 503,
|
|
"duration_secs": 0.503
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "fibonacci",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 503,
|
|
"duration_secs": 0.503
|
|
},
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 601,
|
|
"duration_secs": 0.601
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "square_root",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 539,
|
|
"duration_secs": 0.539
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 738,
|
|
"duration_secs": 0.738
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "power",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 592,
|
|
"duration_secs": 0.592
|
|
},
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 1103,
|
|
"duration_secs": 1.103
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"lastUpdated": "2025-06-05T18:46:14.092Z"
|
|
} |