219 lines
6.4 KiB
JSON
219 lines
6.4 KiB
JSON
{
|
|
"results": [
|
|
{
|
|
"test": "tool-add",
|
|
"prompt": "Use the add tool to calculate 15 plus 27. Return the result.",
|
|
"result": [
|
|
"{\"name\": \"add\", \"arguments\": {\"a\": 15, \"b\": 27}}\n</tool_call>\n"
|
|
],
|
|
"expected": "42",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:29:19.843Z",
|
|
"passed": false,
|
|
"duration": 5223,
|
|
"reason": "Expected 42, but got {\"name\": \"add\", \"arguments\": {\"a\": 15, \"b\": 27}}\n</tool_call>",
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-multiply",
|
|
"prompt": "Use the multiply tool to calculate 8 times 9. Return the result.",
|
|
"result": [
|
|
"{\"name\": \"multiply\", \"arguments\": {\"a\": 8, \"b\": 9}}\n</tool_call>\n"
|
|
],
|
|
"expected": "72",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:29:24.204Z",
|
|
"passed": false,
|
|
"duration": 4354,
|
|
"reason": "Expected 72, but got {\"name\": \"multiply\", \"arguments\": {\"a\": 8, \"b\": 9}}\n</tool_call>",
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-weather",
|
|
"prompt": "What is the weather like in Berlin? Use the get_weather tool.",
|
|
"result": [
|
|
"{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Berlin\"}}\n</tool_call>"
|
|
],
|
|
"expected": "berlin",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:29:27.300Z",
|
|
"passed": true,
|
|
"duration": 3091,
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-selection",
|
|
"prompt": "Use the add tool to calculate 100 plus 200. Do not use any other tool.",
|
|
"result": [
|
|
"{{\"name\": \"add\", \"arguments\": {\"a\": 100, \"b\": 200}}}\n</tool_call>\n"
|
|
],
|
|
"expected": "300",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:29:33.302Z",
|
|
"passed": false,
|
|
"duration": 5996,
|
|
"reason": "Expected 300, but got {{\"name\": \"add\", \"arguments\": {\"a\": 100, \"b\": 200}}}\n</tool_call>",
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-add",
|
|
"prompt": "Use the add tool to calculate 15 plus 27. Return the result.",
|
|
"result": [
|
|
"{\"name\": \"add\", \"arguments\": {\"a\": 15, \"b\": 27}}\n</tool_call>\n"
|
|
],
|
|
"expected": "42",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:30:23.715Z",
|
|
"passed": true,
|
|
"duration": 2949,
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-multiply",
|
|
"prompt": "Use the multiply tool to calculate 8 times 9. Return the result.",
|
|
"result": [
|
|
"{\"name\": \"multiply\", \"arguments\": {\"a\": 8, \"b\": 9}}\n</tool_call>\n"
|
|
],
|
|
"expected": "72",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:30:25.848Z",
|
|
"passed": true,
|
|
"duration": 2127,
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-weather",
|
|
"prompt": "What is the weather like in Berlin? Use the get_weather tool.",
|
|
"result": [
|
|
"{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Berlin\"}}\n</tool_call>"
|
|
],
|
|
"expected": "berlin",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:30:27.436Z",
|
|
"passed": true,
|
|
"duration": 1582,
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-selection",
|
|
"prompt": "Use the add tool to calculate 100 plus 200. Do not use any other tool.",
|
|
"result": [
|
|
"{{\"name\": \"add\", \"arguments\": {\"a\": 100, \"b\": 200}}}\n</tool_call>"
|
|
],
|
|
"expected": "300",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:30:31.362Z",
|
|
"passed": false,
|
|
"duration": 3920,
|
|
"reason": "Expected 300, but got {{\"name\": \"add\", \"arguments\": {\"a\": 100, \"b\": 200}}}\n</tool_call>",
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-add",
|
|
"prompt": "Use the add tool to calculate 15 plus 27. Return the result.",
|
|
"result": [
|
|
"The sum of 15 and 27 is 42."
|
|
],
|
|
"expected": "42",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:39:17.199Z",
|
|
"passed": true,
|
|
"duration": 12126,
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-multiply",
|
|
"prompt": "Use the multiply tool to calculate 8 times 9. Return the result.",
|
|
"result": [
|
|
"The result of 8 times 9 is 72."
|
|
],
|
|
"expected": "72",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:39:27.885Z",
|
|
"passed": true,
|
|
"duration": 10678,
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-weather",
|
|
"prompt": "What is the weather like in Berlin? Use the get_weather tool.",
|
|
"result": [
|
|
"The current temperature in Berlin is 18 degrees Celsius and the weather condition is cloudy."
|
|
],
|
|
"expected": "berlin",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:39:38.036Z",
|
|
"passed": true,
|
|
"duration": 10144,
|
|
"category": "llama-tools"
|
|
},
|
|
{
|
|
"test": "tool-selection",
|
|
"prompt": "Use the add tool to calculate 100 plus 200. Do not use any other tool.",
|
|
"result": [
|
|
"The sum of 100 and 200 is 300."
|
|
],
|
|
"expected": "300",
|
|
"model": "default",
|
|
"router": "default",
|
|
"timestamp": "2026-03-19T17:39:53.564Z",
|
|
"passed": true,
|
|
"duration": 15522,
|
|
"category": "llama-tools"
|
|
}
|
|
],
|
|
"highscores": [
|
|
{
|
|
"test": "tool-add",
|
|
"rankings": [
|
|
{
|
|
"model": "default",
|
|
"duration": 12126,
|
|
"duration_secs": 12.126
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "tool-multiply",
|
|
"rankings": [
|
|
{
|
|
"model": "default",
|
|
"duration": 10678,
|
|
"duration_secs": 10.678
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "tool-weather",
|
|
"rankings": [
|
|
{
|
|
"model": "default",
|
|
"duration": 10144,
|
|
"duration_secs": 10.144
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "tool-selection",
|
|
"rankings": [
|
|
{
|
|
"model": "default",
|
|
"duration": 15522,
|
|
"duration_secs": 15.522
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"lastUpdated": "2026-03-19T17:39:53.564Z"
|
|
} |