mono/packages/kbot/tests/unit/reports/math.json

223 lines
5.9 KiB
JSON

{
"results": [
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Respond ONLY with the solutions as comma-separated numbers (e.g., -3,-2). No other text.",
"result": [
"-2,-3"
],
"expected": "-3,-2",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:07.920Z",
"passed": false,
"duration": 1202,
"reason": "Expected -3,-2, but got -2,-3",
"category": "math"
},
{
"test": "quadratic",
"prompt": "Solve the quadratic equation x² + 5x + 6 = 0. Respond ONLY with the solutions as comma-separated numbers (e.g., -3,-2). No other text.",
"result": [
"-3,-2"
],
"expected": "-3,-2",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:09.012Z",
"passed": true,
"duration": 1088,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Respond ONLY with the final numerical answer. No explanation, no other text.",
"result": [
"120"
],
"expected": "120",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:09.517Z",
"passed": true,
"duration": 503,
"category": "math"
},
{
"test": "factorial",
"prompt": "Calculate 5! (factorial of 5). Respond ONLY with the final numerical answer. No explanation, no other text.",
"result": [
"120"
],
"expected": "120",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:10.000Z",
"passed": true,
"duration": 481,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence (assuming F(1)=1, F(2)=1). Respond ONLY with the final numerical answer. No other text.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:10.505Z",
"passed": true,
"duration": 503,
"category": "math"
},
{
"test": "fibonacci",
"prompt": "Calculate the 6th number in the Fibonacci sequence (assuming F(1)=1, F(2)=1). Respond ONLY with the final numerical answer. No other text.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:11.109Z",
"passed": true,
"duration": 601,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Respond ONLY with the final numerical answer. No other text.",
"result": [
"4"
],
"expected": "4",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:11.849Z",
"passed": true,
"duration": 738,
"category": "math"
},
{
"test": "square_root",
"prompt": "Calculate the square root of 16. Respond ONLY with the final numerical answer. No other text.",
"result": [
"4"
],
"expected": "4",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:12.390Z",
"passed": true,
"duration": 539,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Respond ONLY with the final numerical answer. No other text.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:12.985Z",
"passed": true,
"duration": 592,
"category": "math"
},
{
"test": "power",
"prompt": "Calculate 2 raised to the power of 3. Respond ONLY with the final numerical answer. No other text.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:14.091Z",
"passed": true,
"duration": 1103,
"category": "math"
}
],
"highscores": [
{
"test": "quadratic",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 1088,
"duration_secs": 1.088
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 1202,
"duration_secs": 1.202
}
]
},
{
"test": "factorial",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 481,
"duration_secs": 0.481
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 503,
"duration_secs": 0.503
}
]
},
{
"test": "fibonacci",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 503,
"duration_secs": 0.503
},
{
"model": "openai/gpt-4o-mini",
"duration": 601,
"duration_secs": 0.601
}
]
},
{
"test": "square_root",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 539,
"duration_secs": 0.539
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 738,
"duration_secs": 0.738
}
]
},
{
"test": "power",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 592,
"duration_secs": 0.592
},
{
"model": "openai/gpt-4o-mini",
"duration": 1103,
"duration_secs": 1.103
}
]
}
],
"lastUpdated": "2025-06-05T18:46:14.092Z"
}