latest
This commit is contained in:
parent
2afc1a8051
commit
856ffe680f
File diff suppressed because one or more lines are too long
@ -1,9 +1,9 @@
|
||||
{
|
||||
"model": "google/gemini-2.0-flash-lite-001",
|
||||
"model": "qwen/qwq-32b",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Generate a random number"
|
||||
"content": "divide 15 by 3. Return only the number, no explanation."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
|
||||
@ -1,5 +1,27 @@
|
||||
# Basic Operations Test Results
|
||||
|
||||
## Highscores
|
||||
|
||||
### addition
|
||||
1. anthropic/claude-3.5-sonnet: 1278ms (1.28s)
|
||||
2. qwen/qwq-32b: 6285ms (6.29s)
|
||||
|
||||
### multiplication
|
||||
1. anthropic/claude-3.5-sonnet: 615ms (0.61s)
|
||||
2. qwen/qwq-32b: 9610ms (9.61s)
|
||||
|
||||
### division
|
||||
1. anthropic/claude-3.5-sonnet: 1242ms (1.24s)
|
||||
2. qwen/qwq-32b: 4040ms (4.04s)
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 6
|
||||
- Passed: 6
|
||||
- Failed: 0
|
||||
- Success Rate: 100.00%
|
||||
- Average Duration: 3845ms (3.85s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
*No failed tests*
|
||||
@ -10,48 +32,41 @@
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1551ms
|
||||
- Timestamp: 4/2/2025, 12:17:39 AM
|
||||
- Duration: 1278ms (1278.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:09 PM
|
||||
|
||||
### addition - qwen/qwq-32b
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 3621ms
|
||||
- Timestamp: 4/2/2025, 12:17:42 AM
|
||||
- Duration: 6285ms (6285.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:15 PM
|
||||
|
||||
### multiplication - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 873ms
|
||||
- Timestamp: 4/2/2025, 12:17:43 AM
|
||||
- Duration: 615ms (615.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:16 PM
|
||||
|
||||
### multiplication - qwen/qwq-32b
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 3472ms
|
||||
- Timestamp: 4/2/2025, 12:17:47 AM
|
||||
- Duration: 9610ms (9610.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:25 PM
|
||||
|
||||
### division - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 1183ms
|
||||
- Timestamp: 4/2/2025, 12:17:48 AM
|
||||
- Duration: 1242ms (1242.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:27 PM
|
||||
|
||||
### division - qwen/qwq-32b
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 4841ms
|
||||
- Timestamp: 4/2/2025, 12:17:53 AM
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 6
|
||||
- Passed: 6
|
||||
- Failed: 0
|
||||
- Success Rate: 100.00%
|
||||
- Duration: 4040ms (4040.00s)
|
||||
- Timestamp: 4/2/2025, 12:49:31 PM
|
||||
|
||||
|
||||
@ -1,80 +1,541 @@
|
||||
[
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:39.340Z",
|
||||
"passed": true,
|
||||
"duration": 1551
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:42.962Z",
|
||||
"passed": true,
|
||||
"duration": 3621
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:43.836Z",
|
||||
"passed": true,
|
||||
"duration": 873
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:47.309Z",
|
||||
"passed": true,
|
||||
"duration": 3472
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:48.493Z",
|
||||
"passed": true,
|
||||
"duration": 1183
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:53.335Z",
|
||||
"passed": true,
|
||||
"duration": 4841
|
||||
}
|
||||
]
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:39.340Z",
|
||||
"passed": true,
|
||||
"duration": 1551
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:42.962Z",
|
||||
"passed": true,
|
||||
"duration": 3621
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:43.836Z",
|
||||
"passed": true,
|
||||
"duration": 873
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:47.309Z",
|
||||
"passed": true,
|
||||
"duration": 3472
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:48.493Z",
|
||||
"passed": true,
|
||||
"duration": 1183
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T22:17:53.335Z",
|
||||
"passed": true,
|
||||
"duration": 4841
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:37.069Z",
|
||||
"passed": true,
|
||||
"duration": 1256
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:40.167Z",
|
||||
"passed": false,
|
||||
"duration": 3096,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error",
|
||||
"details": {
|
||||
"stack": "Error: Model returned empty response\n at Module.runTest (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:85:13)\n at processTicksAndRejections (node:internal/process/task_queues:105:5)\n at __vite_ssr_import_0__.it.each.timeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\basic.test.ts:21:20)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:533:5\n at runTest (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1056:11)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runSuite (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1205:15)\n at runFiles (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1262:5)\n at startTests (file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/@vitest/runner/dist/index.js:1271:3)\n at file:///C:/Users/zx/Desktop/polymech/polymech-mono/packages/kbot/node_modules/vitest/dist/chunks/runBaseTests.3qpJUEJM.js:126:11",
|
||||
"message": "Model returned empty response"
|
||||
}
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:41.086Z",
|
||||
"passed": true,
|
||||
"duration": 916
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:45.879Z",
|
||||
"passed": true,
|
||||
"duration": 4793
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:46.900Z",
|
||||
"passed": true,
|
||||
"duration": 1020
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:38:50.446Z",
|
||||
"passed": true,
|
||||
"duration": 3545
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:39:58.836Z",
|
||||
"passed": true,
|
||||
"duration": 1266
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:40:02.777Z",
|
||||
"passed": true,
|
||||
"duration": 3939
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:40:03.961Z",
|
||||
"passed": true,
|
||||
"duration": 1183
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:40:06.962Z",
|
||||
"passed": true,
|
||||
"duration": 3000
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:40:08.115Z",
|
||||
"passed": true,
|
||||
"duration": 1152
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:40:12.565Z",
|
||||
"passed": true,
|
||||
"duration": 4449
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:41:52.176Z",
|
||||
"passed": true,
|
||||
"duration": 1458
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:41:55.869Z",
|
||||
"passed": true,
|
||||
"duration": 3691
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:41:57.106Z",
|
||||
"passed": true,
|
||||
"duration": 1236
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:41:59.974Z",
|
||||
"passed": true,
|
||||
"duration": 2867
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:42:01.272Z",
|
||||
"passed": true,
|
||||
"duration": 1297
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:42:04.326Z",
|
||||
"passed": true,
|
||||
"duration": 3053
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:39.002Z",
|
||||
"passed": true,
|
||||
"duration": 1196
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:48.668Z",
|
||||
"passed": true,
|
||||
"duration": 9664
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:49.806Z",
|
||||
"passed": true,
|
||||
"duration": 1137
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:53.017Z",
|
||||
"passed": true,
|
||||
"duration": 3210
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:53.814Z",
|
||||
"passed": true,
|
||||
"duration": 796
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:44:55.383Z",
|
||||
"passed": true,
|
||||
"duration": 1568
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:47:00.955Z",
|
||||
"passed": true,
|
||||
"duration": 1297
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:09.343Z",
|
||||
"passed": true,
|
||||
"duration": 1278
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:15.630Z",
|
||||
"passed": true,
|
||||
"duration": 6285
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:16.246Z",
|
||||
"passed": true,
|
||||
"duration": 615
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:25.857Z",
|
||||
"passed": true,
|
||||
"duration": 9610
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:27.101Z",
|
||||
"passed": true,
|
||||
"duration": 1242
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T10:49:31.142Z",
|
||||
"passed": true,
|
||||
"duration": 4040
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
{
|
||||
"test": "addition",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1278,
|
||||
"duration_secs": 1.278
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 6285,
|
||||
"duration_secs": 6.285
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 615,
|
||||
"duration_secs": 0.615
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 9610,
|
||||
"duration_secs": 9.61
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1242,
|
||||
"duration_secs": 1.242
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 4040,
|
||||
"duration_secs": 4.04
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-02T10:49:31.142Z"
|
||||
}
|
||||
@ -1,8 +1,5 @@
|
||||
import { describe, it, expect } from 'vitest'
|
||||
import { run } from '../../src/index'
|
||||
import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import {
|
||||
models,
|
||||
@ -11,200 +8,54 @@ import {
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
formatError,
|
||||
isEmptyResponse
|
||||
runTest,
|
||||
generateTestReport
|
||||
} from './commons'
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './basic.json')
|
||||
|
||||
describe('Basic Operations', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
|
||||
const runBasicTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
let startTime = Date.now()
|
||||
let error: TestResult['error'] | undefined
|
||||
let testResult: TestResult | undefined
|
||||
|
||||
try {
|
||||
const result = await Promise.race([
|
||||
run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}),
|
||||
new Promise((_, reject) =>
|
||||
setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
|
||||
)
|
||||
]) as string[]
|
||||
|
||||
if (isEmptyResponse(result)) {
|
||||
throw new Error('Model returned empty response')
|
||||
}
|
||||
|
||||
const actual = result?.[0]?.trim()?.toLowerCase() || ''
|
||||
const passed = actual === expected
|
||||
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
testResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
duration: Date.now() - startTime,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
|
||||
}
|
||||
} catch (e) {
|
||||
error = formatError(e)
|
||||
testResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed: false,
|
||||
duration: Date.now() - startTime,
|
||||
error,
|
||||
reason: error?.message || 'Unknown error occurred'
|
||||
}
|
||||
throw e
|
||||
} finally {
|
||||
if (testResult) {
|
||||
testResults.push(testResult)
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
it.each(models)('should add two numbers with model %s', async (modelName) => {
|
||||
await runBasicTest(
|
||||
const result = await runTest(
|
||||
'add 5 and 3. Return only the number, no explanation.',
|
||||
'8',
|
||||
'addition',
|
||||
modelName
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('8')
|
||||
}, { timeout: 10000 })
|
||||
|
||||
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
|
||||
await runBasicTest(
|
||||
const result = await runTest(
|
||||
'multiply 8 and 3. Return only the number, no explanation.',
|
||||
'24',
|
||||
'multiplication',
|
||||
modelName
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('24')
|
||||
}, { timeout: 10000 })
|
||||
|
||||
it.each(models)('should divide two numbers with model %s', async (modelName) => {
|
||||
await runBasicTest(
|
||||
const result = await runTest(
|
||||
'divide 15 by 3. Return only the number, no explanation.',
|
||||
'5',
|
||||
'division',
|
||||
modelName
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('5')
|
||||
}, { timeout: 10000 })
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
// Group results by test and model
|
||||
const latestResults = new Map<string, Map<string, TestResult>>()
|
||||
|
||||
// Get only the latest result for each test+model combination
|
||||
testResults.forEach(result => {
|
||||
if (!latestResults.has(result.test)) {
|
||||
latestResults.set(result.test, new Map())
|
||||
}
|
||||
const testMap = latestResults.get(result.test)!
|
||||
const existingResult = testMap.get(result.model)
|
||||
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
|
||||
testMap.set(result.model, result)
|
||||
}
|
||||
})
|
||||
|
||||
// Generate markdown report
|
||||
let report = '# Basic Operations Test Results\n\n'
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
let hasFailures = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
hasFailures = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
if (result.error) {
|
||||
report += `- Error Type: ${result.error.type}\n`
|
||||
report += `- Error Code: ${result.error.code}\n`
|
||||
report += `- Error Message: ${result.error.message}\n`
|
||||
if (result.error.details?.message) {
|
||||
report += `- Error Details: ${result.error.details.message}\n`
|
||||
}
|
||||
}
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFailures) {
|
||||
report += '*No failed tests*\n\n'
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
let hasPassed = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
hasPassed = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasPassed) {
|
||||
report += '*No passed tests*\n\n'
|
||||
}
|
||||
|
||||
// Add summary section
|
||||
report += '## Summary\n\n'
|
||||
const totalTests = testResults.length
|
||||
const passedTests = testResults.filter(r => r.passed).length
|
||||
const failedTests = totalTests - passedTests
|
||||
report += `- Total Tests: ${totalTests}\n`
|
||||
report += `- Passed: ${passedTests}\n`
|
||||
report += `- Failed: ${failedTests}\n`
|
||||
report += `- Success Rate: ${((passedTests / totalTests) * 100).toFixed(2)}%\n\n`
|
||||
|
||||
// Write report to file
|
||||
const reportPath = path.resolve(__dirname, './basic-report.md')
|
||||
write(reportPath, report)
|
||||
|
||||
// Verify report was written
|
||||
generateTestReport(testResults, 'Basic Operations Test Results', reportPath)
|
||||
expect(exists(reportPath) === 'file').toBe(true)
|
||||
})
|
||||
})
|
||||
@ -1,5 +1,9 @@
|
||||
import * as path from 'node:path'
|
||||
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL, E_OPENROUTER_MODEL } from '../../src/index'
|
||||
import { run } from '../../src/index'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
|
||||
export const models = [
|
||||
//E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
|
||||
@ -31,6 +35,15 @@ export interface TestResult {
|
||||
duration?: number
|
||||
}
|
||||
|
||||
export interface TestHighscore {
|
||||
test: string;
|
||||
rankings: {
|
||||
model: string;
|
||||
duration: number;
|
||||
duration_secs: number;
|
||||
}[];
|
||||
}
|
||||
|
||||
export const formatError = (error: any): TestResult['error'] => {
|
||||
return {
|
||||
message: error?.message || 'Unknown error',
|
||||
@ -43,3 +56,251 @@ export const formatError = (error: any): TestResult['error'] => {
|
||||
export const isEmptyResponse = (result: string[] | null | undefined): boolean => {
|
||||
return !result || result.length === 0 || result.every(r => !r || r.trim() === '')
|
||||
}
|
||||
|
||||
export const generateHighscores = (latestResults: Map<string, Map<string, TestResult>>): TestHighscore[] => {
|
||||
const highscores: TestHighscore[] = []
|
||||
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
// Convert model results to array and sort by duration
|
||||
const sortedResults = Array.from(modelResults.entries())
|
||||
.map(([model, result]) => ({ model, result }))
|
||||
.sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0))
|
||||
.slice(0, 2) // Get top 2
|
||||
|
||||
if (sortedResults.length > 0) {
|
||||
highscores.push({
|
||||
test: testName,
|
||||
rankings: sortedResults.map(({ model, result }) => ({
|
||||
model,
|
||||
duration: result.duration || 0,
|
||||
duration_secs: (result.duration || 0) / 1000
|
||||
}))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return highscores
|
||||
}
|
||||
|
||||
export const runTest = async (
|
||||
prompt: string,
|
||||
expected: string,
|
||||
testName: string,
|
||||
modelName: string,
|
||||
logPath: string
|
||||
): Promise<TestResult> => {
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
let startTime = Date.now()
|
||||
let error: TestResult['error'] | undefined
|
||||
let testResult: TestResult | undefined
|
||||
|
||||
try {
|
||||
const result = await Promise.race([
|
||||
run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}),
|
||||
new Promise((_, reject) =>
|
||||
setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
|
||||
)
|
||||
]) as string[]
|
||||
|
||||
if (isEmptyResponse(result)) {
|
||||
testResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed: false,
|
||||
duration: Date.now() - startTime,
|
||||
reason: 'Model returned empty response'
|
||||
}
|
||||
} else {
|
||||
const actual = result?.[0]?.trim()?.toLowerCase() || ''
|
||||
const passed = actual === expected
|
||||
|
||||
testResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
duration: Date.now() - startTime,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
error = formatError(e)
|
||||
testResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed: false,
|
||||
duration: Date.now() - startTime,
|
||||
error,
|
||||
reason: error?.message || 'Unknown error occurred'
|
||||
}
|
||||
throw e
|
||||
} finally {
|
||||
if (testResult) {
|
||||
const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath)) : { results: [], highscores: [] }
|
||||
const updatedResults = [...(existingData.results || []), testResult]
|
||||
|
||||
// Group results by test and model
|
||||
const latestResults = new Map<string, Map<string, TestResult>>()
|
||||
updatedResults.forEach(result => {
|
||||
if (!latestResults.has(result.test)) {
|
||||
latestResults.set(result.test, new Map())
|
||||
}
|
||||
const testMap = latestResults.get(result.test)!
|
||||
const existingResult = testMap.get(result.model)
|
||||
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
|
||||
testMap.set(result.model, result)
|
||||
}
|
||||
})
|
||||
|
||||
// Generate highscores
|
||||
const highscores = generateHighscores(latestResults)
|
||||
|
||||
// Write both results and highscores
|
||||
write(logPath, JSON.stringify({
|
||||
results: updatedResults,
|
||||
highscores,
|
||||
lastUpdated: new Date().toISOString()
|
||||
}, null, 2))
|
||||
}
|
||||
}
|
||||
return testResult
|
||||
}
|
||||
|
||||
export const generateTestReport = (
|
||||
testResults: TestResult[],
|
||||
reportTitle: string,
|
||||
reportPath: string
|
||||
): void => {
|
||||
// Group results by test and model
|
||||
const latestResults = new Map<string, Map<string, TestResult>>()
|
||||
|
||||
// Get only the latest result for each test+model combination
|
||||
testResults.forEach(result => {
|
||||
if (!latestResults.has(result.test)) {
|
||||
latestResults.set(result.test, new Map())
|
||||
}
|
||||
const testMap = latestResults.get(result.test)!
|
||||
const existingResult = testMap.get(result.model)
|
||||
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
|
||||
testMap.set(result.model, result)
|
||||
}
|
||||
})
|
||||
|
||||
// Generate markdown report
|
||||
let report = `# ${reportTitle}\n\n`
|
||||
|
||||
// Add highscore section
|
||||
report += '## Highscores\n\n'
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
report += `### ${testName}\n`
|
||||
|
||||
// Convert model results to array and sort by duration
|
||||
const sortedResults = Array.from(modelResults.entries())
|
||||
.map(([model, result]) => ({ model, result }))
|
||||
.sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0))
|
||||
.slice(0, 2) // Get top 2
|
||||
|
||||
if (sortedResults.length > 0) {
|
||||
sortedResults.forEach(({ model, result }, index) => {
|
||||
const duration = result.duration || 0
|
||||
report += `${index + 1}. ${model}: ${duration}ms (${(duration / 1000).toFixed(2)}s)\n`
|
||||
})
|
||||
} else {
|
||||
report += '*No results available*\n'
|
||||
}
|
||||
report += '\n'
|
||||
}
|
||||
|
||||
// Add summary section
|
||||
report += '## Summary\n\n'
|
||||
const totalTests = testResults.length
|
||||
const passedTests = testResults.filter(r => r.passed).length
|
||||
const failedTests = totalTests - passedTests
|
||||
const avgDuration = testResults.reduce((sum, r) => sum + (r.duration || 0), 0) / totalTests
|
||||
report += `- Total Tests: ${totalTests}\n`
|
||||
report += `- Passed: ${passedTests}\n`
|
||||
report += `- Failed: ${failedTests}\n`
|
||||
report += `- Success Rate: ${((passedTests / totalTests) * 100).toFixed(2)}%\n`
|
||||
report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n`
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
let hasFailures = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
hasFailures = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms (${(result.duration || 0 / 1000).toFixed(2)}s)\n`
|
||||
if (result.error) {
|
||||
report += `- Error Type: ${result.error.type}\n`
|
||||
report += `- Error Code: ${result.error.code}\n`
|
||||
report += `- Error Message: ${result.error.message}\n`
|
||||
if (result.error.details?.message) {
|
||||
report += `- Error Details: ${result.error.details.message}\n`
|
||||
}
|
||||
}
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFailures) {
|
||||
report += '*No failed tests*\n\n'
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
let hasPassed = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
hasPassed = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms (${(result.duration || 0 / 1000).toFixed(2)}s)\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasPassed) {
|
||||
report += '*No passed tests*\n\n'
|
||||
}
|
||||
|
||||
// Write report to file
|
||||
write(reportPath, report)
|
||||
}
|
||||
|
||||
@ -6,21 +6,14 @@
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### json-schema-file-format - google/gemini-2.0-flash-lite-001
|
||||
### json-schema-file-format - mistralai/mistral-tiny
|
||||
- Prompt: `Create a user profile with name John Doe, age 30, and tags ["developer", "javascript"]. Return only the JSON object, no explanation.`
|
||||
- Expected: `{"name":"John Doe","age":30,"tags":["developer","javascript"]}`
|
||||
- Actual: ````json
|
||||
{
|
||||
"name": "John Doe",
|
||||
"age": 30,
|
||||
"tags": ["developer", "javascript"]
|
||||
}
|
||||
```
|
||||
`
|
||||
- Duration: 1122ms
|
||||
- Timestamp: 4/2/2025, 9:03:33 AM
|
||||
- Actual: `{"name": "John Doe", "age": 30, "tags": ["developer", "javascript"]}`
|
||||
- Duration: 982ms
|
||||
- Timestamp: 4/2/2025, 10:26:33 AM
|
||||
|
||||
### json-schema-object-format - google/gemini-2.0-flash-lite-001
|
||||
### json-schema-object-format - mistralai/mistral-tiny
|
||||
- Prompt: `Create a user profile with the following details:
|
||||
- Name: Jane Smith
|
||||
- Age: 25
|
||||
@ -39,8 +32,7 @@
|
||||
}
|
||||
Return only the JSON object, no explanation.`
|
||||
- Expected: `{"name":"Jane Smith","age":25,"email":"jane.smith@company.com","tags":["developer","designer"],"address":{"street":"123 Main St","city":"New York","country":"US","postal_code":"10001"},"preferences":{"theme":"light","notifications":"enabled","language":"English"}}`
|
||||
- Actual: ````json
|
||||
{
|
||||
- Actual: `{
|
||||
"name": "Jane Smith",
|
||||
"age": 25,
|
||||
"email": "jane.smith@company.com",
|
||||
@ -51,16 +43,14 @@
|
||||
"country": "US",
|
||||
"postal_code": "10001"
|
||||
},
|
||||
"preferences": {
|
||||
"Preferences": {
|
||||
"theme": "light",
|
||||
"notifications": "enabled",
|
||||
"language": "English"
|
||||
}
|
||||
}
|
||||
```
|
||||
`
|
||||
- Duration: 1289ms
|
||||
- Timestamp: 4/2/2025, 9:03:34 AM
|
||||
}`
|
||||
- Duration: 1673ms
|
||||
- Timestamp: 4/2/2025, 10:26:35 AM
|
||||
|
||||
## Summary
|
||||
|
||||
|
||||
@ -3,26 +3,26 @@
|
||||
"test": "json-schema-file-format",
|
||||
"prompt": "Create a user profile with name John Doe, age 30, and tags [\"developer\", \"javascript\"]. Return only the JSON object, no explanation.",
|
||||
"result": [
|
||||
"```json\n{\n \"name\": \"John Doe\",\n \"age\": 30,\n \"tags\": [\"developer\", \"javascript\"]\n}\n```\n"
|
||||
"{\"name\": \"John Doe\", \"age\": 30, \"tags\": [\"developer\", \"javascript\"]}"
|
||||
],
|
||||
"expected": "{\"name\":\"John Doe\",\"age\":30,\"tags\":[\"developer\",\"javascript\"]}",
|
||||
"model": "google/gemini-2.0-flash-lite-001",
|
||||
"model": "mistralai/mistral-tiny",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T07:03:33.038Z",
|
||||
"timestamp": "2025-04-02T08:26:33.513Z",
|
||||
"passed": true,
|
||||
"duration": 1122
|
||||
"duration": 982
|
||||
},
|
||||
{
|
||||
"test": "json-schema-object-format",
|
||||
"prompt": "Create a user profile with the following details:\n - Name: Jane Smith\n - Age: 25\n - Email: jane.smith@company.com\n - Tags: [\"developer\", \"designer\"]\n - Address: {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n }\n - Preferences: {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n Return only the JSON object, no explanation.",
|
||||
"result": [
|
||||
"```json\n{\n \"name\": \"Jane Smith\",\n \"age\": 25,\n \"email\": \"jane.smith@company.com\",\n \"tags\": [\"developer\", \"designer\"],\n \"address\": {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n },\n \"preferences\": {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n}\n```\n"
|
||||
"{\n \"name\": \"Jane Smith\",\n \"age\": 25,\n \"email\": \"jane.smith@company.com\",\n \"tags\": [\"developer\", \"designer\"],\n \"address\": {\n \"street\": \"123 Main St\",\n \"city\": \"New York\",\n \"country\": \"US\",\n \"postal_code\": \"10001\"\n },\n \"Preferences\": {\n \"theme\": \"light\",\n \"notifications\": \"enabled\",\n \"language\": \"English\"\n }\n}"
|
||||
],
|
||||
"expected": "{\"name\":\"Jane Smith\",\"age\":25,\"email\":\"jane.smith@company.com\",\"tags\":[\"developer\",\"designer\"],\"address\":{\"street\":\"123 Main St\",\"city\":\"New York\",\"country\":\"US\",\"postal_code\":\"10001\"},\"preferences\":{\"theme\":\"light\",\"notifications\":\"enabled\",\"language\":\"English\"}}",
|
||||
"model": "google/gemini-2.0-flash-lite-001",
|
||||
"model": "mistralai/mistral-tiny",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T07:03:34.328Z",
|
||||
"timestamp": "2025-04-02T08:26:35.187Z",
|
||||
"passed": true,
|
||||
"duration": 1289
|
||||
"duration": 1673
|
||||
}
|
||||
]
|
||||
@ -17,7 +17,8 @@ import {
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './format.json')
|
||||
const TEST_SCHEMA_PATH = path.resolve(__dirname, './test-schema.json')
|
||||
const TEST_MODEL = 'google/gemini-2.0-flash-lite-001'
|
||||
const TEST_MODEL_FAST = 'mistralai/codestral-2501'
|
||||
const TEST_MODEL = 'mistralai/mistral-tiny'
|
||||
const TEST_ROUTER = 'openrouter'
|
||||
|
||||
// Sample JSON Schema for testing
|
||||
|
||||
@ -40,6 +40,22 @@
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 11:53:55 PM
|
||||
|
||||
### german - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 1753ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:29:45 AM
|
||||
|
||||
### german - qwen/qwq-32b
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 13757ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:29:59 AM
|
||||
|
||||
### spanish - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
@ -78,6 +94,22 @@
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 11:53:56 PM
|
||||
|
||||
### spanish - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 1242ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:29:51 AM
|
||||
|
||||
### spanish - qwen/qwq-32b
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 5925ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:29:57 AM
|
||||
|
||||
### french - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
@ -116,6 +148,22 @@
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 11:53:57 PM
|
||||
|
||||
### french - anthropic/claude-3.5-sonnet
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 2656ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:29:59 AM
|
||||
|
||||
### french - qwen/qwq-32b
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 4063ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/2/2025, 10:30:03 AM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### german_translation - deepseek/deepseek-chat:free
|
||||
|
||||
@ -843,5 +843,77 @@
|
||||
"passed": false,
|
||||
"duration": 968,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:29:45.316Z",
|
||||
"passed": false,
|
||||
"duration": 1753,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:29:51.577Z",
|
||||
"passed": false,
|
||||
"duration": 1242,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:29:57.503Z",
|
||||
"passed": false,
|
||||
"duration": 5925,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:29:59.075Z",
|
||||
"passed": false,
|
||||
"duration": 13757,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:29:59.249Z",
|
||||
"passed": false,
|
||||
"duration": 2656,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-02T08:30:03.313Z",
|
||||
"passed": false,
|
||||
"duration": 4063,
|
||||
"reason": "Unknown error occurred"
|
||||
}
|
||||
]
|
||||
Loading…
Reference in New Issue
Block a user