tests:language - merci, habsch selbstgemacht :)
This commit is contained in:
parent
4ec4053898
commit
91517c7ac1
@ -3,7 +3,7 @@
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation."
|
||||
"content": "divide 15 by 3. Return only the number, no explanation."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
|
||||
@ -17,8 +17,9 @@
|
||||
"dev": "tsc -p . --watch",
|
||||
"lint": "eslint src --ext .ts",
|
||||
"test": "vitest run",
|
||||
"test:unit": "vitest run tests/unit",
|
||||
"test:basic": "vitest run tests/unit/basic.test.ts",
|
||||
"test:math": "vitest run tests/unit/math.test.ts",
|
||||
"test:language": "vitest run tests/unit/language.test.ts",
|
||||
"test2:watch": "vitest",
|
||||
"test2:coverage": "vitest run --coverage",
|
||||
"webpack": "webpack --config webpack.config.js --stats-error-details",
|
||||
|
||||
109
packages/kbot/tests/unit/basic-report.md
Normal file
109
packages/kbot/tests/unit/basic-report.md
Normal file
@ -0,0 +1,109 @@
|
||||
# Basic Operations Test Results
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### basic_arithmetic - deepseek/deepseek-chat:free
|
||||
- Prompt: `return the result of 2+2, dont comment`
|
||||
- Expected: `undefined`
|
||||
- Actual: `4`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:30 PM
|
||||
|
||||
### basic_arithmetic - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `return the result of 2+2, dont comment`
|
||||
- Expected: `undefined`
|
||||
- Actual: `4
|
||||
`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:31 PM
|
||||
|
||||
### basic_arithmetic - gpt-4
|
||||
- Prompt: `return the result of 2+2, dont comment`
|
||||
- Expected: `undefined`
|
||||
- Actual: `4`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:32 PM
|
||||
|
||||
### json_structure - deepseek/deepseek-chat:free
|
||||
- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.`
|
||||
- Expected: `undefined`
|
||||
- Actual: `{"name":"test","value":42}`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:33 PM
|
||||
|
||||
### json_structure - gpt-4
|
||||
- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.`
|
||||
- Expected: `undefined`
|
||||
- Actual: `{"name": "test", "value": 42}`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:36 PM
|
||||
|
||||
### json_structure - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `return a JSON object with two fields: "name" as "test" and "value" as 42. Return only the JSON, no other text.`
|
||||
- Expected: `undefined`
|
||||
- Actual: `{
|
||||
"name": "test",
|
||||
"value": 42
|
||||
}`
|
||||
- Reason: undefined
|
||||
- Timestamp: 4/1/2025, 12:26:35 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### addition - deepseek/deepseek-chat:free
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Timestamp: 4/1/2025, 12:59:06 PM
|
||||
|
||||
### addition - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:59:08 PM
|
||||
|
||||
### addition - gpt-4
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Timestamp: 4/1/2025, 1:00:35 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-chat:free
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Timestamp: 4/1/2025, 12:59:13 PM
|
||||
|
||||
### multiplication - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:59:15 PM
|
||||
|
||||
### multiplication - gpt-4
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Timestamp: 4/1/2025, 1:00:37 PM
|
||||
|
||||
### division - deepseek/deepseek-chat:free
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Timestamp: 4/1/2025, 12:59:18 PM
|
||||
|
||||
### division - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:56:09 PM
|
||||
|
||||
### division - gpt-4
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Timestamp: 4/1/2025, 1:00:40 PM
|
||||
|
||||
@ -228,5 +228,557 @@
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:26:36.239Z"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:03.882Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8\n"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:05.343Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:06.365Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 4 and 6. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:07.564Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 4 and 6. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24\n"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:09.013Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 4 and 6. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:09.968Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:10.981Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5\n"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:12.829Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:13.957Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:47.732Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8\n"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:50.075Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:50.977Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 4 and 6. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:52.835Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 4 and 6. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24\n"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:54.875Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 4 and 6. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:55.866Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:56.950Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:59.317Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:01.223Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8\n"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:02.717Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:03.867Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 4 and 6. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:04.864Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 4 and 6. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24\n"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:05.973Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 4 and 6. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:07.075Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:08.103Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5\n"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:09.298Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:10.178Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:32.641Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 4 and 6. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:35.038Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:37.567Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:06.742Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8\n"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:08.889Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:09.871Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:13.627Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24\n"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:15.963Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:17.124Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:18.527Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:20.298Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:58.244Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:00:00.620Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:00:03.143Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:00:10.210Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:00:12.798Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:00:15.604Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:00:35.256Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:00:37.681Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:00:40.556Z",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
@ -4,32 +4,10 @@ import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
|
||||
|
||||
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
|
||||
|
||||
const models = [
|
||||
E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
|
||||
E_OPENROUTER_MODEL_FREE.MODEL_FREE_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE,
|
||||
E_OPENAI_MODEL.MODEL_GPT_4
|
||||
]
|
||||
|
||||
const TEST_BASE_PATH = path.resolve(__dirname, '../../')
|
||||
const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
|
||||
const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './basic.json')
|
||||
|
||||
interface TestResult {
|
||||
test: string;
|
||||
prompt: string;
|
||||
result: string[];
|
||||
expected: string;
|
||||
model: string;
|
||||
router: string;
|
||||
timestamp: string;
|
||||
passed: boolean;
|
||||
reason?: string;
|
||||
}
|
||||
|
||||
describe('Basic Operations', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
@ -58,8 +36,8 @@ describe('Basic Operations', () => {
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
const actual = result.map(r => r.trim())[0]
|
||||
|
||||
const actual = result?.[0]?.trim() || ''
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
@ -67,13 +45,13 @@ describe('Basic Operations', () => {
|
||||
testResults.push({
|
||||
test: 'addition',
|
||||
prompt,
|
||||
result,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
@ -81,7 +59,7 @@ describe('Basic Operations', () => {
|
||||
})
|
||||
|
||||
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
|
||||
const prompt = 'multiply 4 and 6. Return only the number, no explanation.'
|
||||
const prompt = 'multiply 8 and 3. Return only the number, no explanation.'
|
||||
const expected = '24'
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
@ -98,9 +76,9 @@ describe('Basic Operations', () => {
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
}) as string[]
|
||||
|
||||
const actual = result.map(r => r.trim())[0]
|
||||
const actual = result?.[0]?.trim() || ''
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
@ -108,13 +86,13 @@ describe('Basic Operations', () => {
|
||||
testResults.push({
|
||||
test: 'multiplication',
|
||||
prompt,
|
||||
result,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
@ -139,9 +117,9 @@ describe('Basic Operations', () => {
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
}) as string[]
|
||||
|
||||
const actual = result.map(r => r.trim())[0]
|
||||
const actual = result?.[0]?.trim() || ''
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
@ -149,16 +127,72 @@ describe('Basic Operations', () => {
|
||||
testResults.push({
|
||||
test: 'division',
|
||||
prompt,
|
||||
result,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
})
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
// Group results by test and model
|
||||
const latestResults = new Map<string, Map<string, TestResult>>()
|
||||
|
||||
// Get only the latest result for each test+model combination
|
||||
testResults.forEach(result => {
|
||||
if (!latestResults.has(result.test)) {
|
||||
latestResults.set(result.test, new Map())
|
||||
}
|
||||
const testMap = latestResults.get(result.test)!
|
||||
const existingResult = testMap.get(result.model)
|
||||
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
|
||||
testMap.set(result.model, result)
|
||||
}
|
||||
})
|
||||
|
||||
// Generate markdown report
|
||||
let report = '# Basic Operations Test Results\n\n'
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write report to file
|
||||
const reportPath = path.resolve(__dirname, './basic-report.md')
|
||||
write(reportPath, report)
|
||||
|
||||
// Verify report was written
|
||||
expect(exists(reportPath) === 'file').toBe(true)
|
||||
})
|
||||
})
|
||||
24
packages/kbot/tests/unit/commons.ts
Normal file
24
packages/kbot/tests/unit/commons.ts
Normal file
@ -0,0 +1,24 @@
|
||||
import * as path from 'node:path'
|
||||
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
|
||||
|
||||
export const models = [
|
||||
E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
|
||||
E_OPENROUTER_MODEL_FREE.MODEL_FREE_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE,
|
||||
E_OPENAI_MODEL.MODEL_GPT_4
|
||||
]
|
||||
|
||||
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
|
||||
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
|
||||
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
|
||||
|
||||
export interface TestResult {
|
||||
test: string;
|
||||
prompt: string;
|
||||
result: string[];
|
||||
expected: string;
|
||||
model: string;
|
||||
router: string;
|
||||
timestamp: string;
|
||||
passed: boolean;
|
||||
reason?: string;
|
||||
}
|
||||
113
packages/kbot/tests/unit/language-report.md
Normal file
113
packages/kbot/tests/unit/language-report.md
Normal file
@ -0,0 +1,113 @@
|
||||
# Language Test Results
|
||||
|
||||
## Failed Tests
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### german_translation - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "hello" to German. Return only the translation, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo`
|
||||
- Timestamp: 4/1/2025, 12:56:01 PM
|
||||
|
||||
### german_translation - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "hello" to German. Return only the translation, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:56:02 PM
|
||||
|
||||
### german_translation - gpt-4
|
||||
- Prompt: `translate "hello" to German. Return only the translation, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo`
|
||||
- Timestamp: 4/1/2025, 12:56:32 PM
|
||||
|
||||
### spanish_translation - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `sí`
|
||||
- Timestamp: 4/1/2025, 12:56:05 PM
|
||||
|
||||
### spanish_translation - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `sí
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:56:06 PM
|
||||
|
||||
### spanish_translation - gpt-4
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `sí`
|
||||
- Timestamp: 4/1/2025, 12:56:35 PM
|
||||
|
||||
### french_translation - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "no" to French. Return only the translation, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non`
|
||||
- Timestamp: 4/1/2025, 12:56:08 PM
|
||||
|
||||
### french_translation - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "no" to French. Return only the translation, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:56:10 PM
|
||||
|
||||
### french_translation - gpt-4
|
||||
- Prompt: `translate "no" to French. Return only the translation, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non`
|
||||
- Timestamp: 4/1/2025, 12:56:37 PM
|
||||
|
||||
### german - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo`
|
||||
- Timestamp: 4/1/2025, 12:59:07 PM
|
||||
|
||||
### german - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:59:09 PM
|
||||
|
||||
### german - gpt-4
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo`
|
||||
- Timestamp: 4/1/2025, 12:59:11 PM
|
||||
|
||||
### spanish - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `Sí`
|
||||
- Timestamp: 4/1/2025, 12:59:12 PM
|
||||
|
||||
### spanish - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `Sí
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:59:14 PM
|
||||
|
||||
### spanish - gpt-4
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `sí`
|
||||
- Timestamp: 4/1/2025, 12:59:15 PM
|
||||
|
||||
### french - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non`
|
||||
- Timestamp: 4/1/2025, 12:59:18 PM
|
||||
|
||||
### french - gpt-4
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non`
|
||||
- Timestamp: 4/1/2025, 12:59:20 PM
|
||||
|
||||
530
packages/kbot/tests/unit/language.json
Normal file
530
packages/kbot/tests/unit/language.json
Normal file
@ -0,0 +1,530 @@
|
||||
[
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:01.527Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo\n"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:03.169Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:04.655Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:05.726Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí\n"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:08.264Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:09.514Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:11.281Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non\n"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:12.942Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:50:14.089Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:53.360Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo\n"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:54.551Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:55.699Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:56.740Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Sí\n"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:57.918Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:51:58.882Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:52:00.119Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Non\n"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:52:01.184Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:52:02.204Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:50.924Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo\n"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:52.477Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:53.546Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:56.815Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:58.718Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:54:01.673Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:01.199Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo\n"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:02.857Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:03.788Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:05.218Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí\n"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:06.317Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:07.436Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:08.879Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non\n"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:10.365Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:11.295Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german_translation",
|
||||
"prompt": "translate \"hello\" to German. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:32.735Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish_translation",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:35.276Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french_translation",
|
||||
"prompt": "translate \"no\" to French. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:37.673Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:07.944Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Hallo\n"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:09.471Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:11.335Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:12.740Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Sí\n"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:14.246Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:15.205Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:18.139Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:20.589Z",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
200
packages/kbot/tests/unit/language.test.ts
Normal file
200
packages/kbot/tests/unit/language.test.ts
Normal file
@ -0,0 +1,200 @@
|
||||
import { describe, it, expect } from 'vitest'
|
||||
import { run } from '../../src/index'
|
||||
import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
|
||||
|
||||
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './language.json')
|
||||
|
||||
describe('Language Capabilities', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
// Load existing results if any
|
||||
if (exists(TEST_LOG_PATH)) {
|
||||
const data = read(TEST_LOG_PATH, 'json')
|
||||
testResults = Array.isArray(data) ? data : []
|
||||
}
|
||||
|
||||
// Asks every configured model to translate "hello" into German and logs the
// outcome (pass or fail) to TEST_LOG_PATH before asserting.
it.each(models)('should translate "hello" to German with model %s', async (modelName) => {
  const prompt = 'translate "hello" to German. Return only the translated word, no explanation.'
  const expected = 'hallo'
  // Overwritten by the onRun hook below with the values found on the options
  // object that `run` hands to the hook.
  let model = 'unknown'
  let router = 'unknown'

  const result = await run({
    prompt,
    mode: 'completion',
    model: modelName,
    path: TEST_BASE_PATH,
    logs: TEST_LOGS_PATH,
    preferences: TEST_PREFERENCES_PATH,
    onRun: async (options) => {
      model = options.model || 'unknown'
      router = options.router || 'unknown'
      return options
    }
  }) as string[]

  // Compare case-insensitively and ignore surrounding whitespace/newlines.
  const actual = result?.[0]?.trim()?.toLowerCase() || ''
  const passed = actual === expected

  // Record the outcome BEFORE asserting: a failing expect() throws, so
  // asserting first would silently drop every failed run from the log and
  // leave the "Failed Tests" section of the report permanently empty.
  testResults.push({
    test: 'german',
    prompt,
    result: result || [],
    expected,
    model,
    router,
    timestamp: new Date().toISOString(),
    passed,
    reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
  })

  // Write all results to the same file
  write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))

  expect(actual).toEqual(expected)
})
|
||||
|
||||
// Asks every configured model to translate "yes" into Spanish and logs the
// outcome (pass or fail) to TEST_LOG_PATH before asserting.
it.each(models)('should translate "yes" to Spanish with model %s', async (modelName) => {
  const prompt = 'translate "yes" to Spanish. Return only the translated word, no explanation.'
  const expected = 'sí'
  // Overwritten by the onRun hook below with the values found on the options
  // object that `run` hands to the hook.
  let model = 'unknown'
  let router = 'unknown'

  const result = await run({
    prompt,
    mode: 'completion',
    model: modelName,
    path: TEST_BASE_PATH,
    logs: TEST_LOGS_PATH,
    preferences: TEST_PREFERENCES_PATH,
    onRun: async (options) => {
      model = options.model || 'unknown'
      router = options.router || 'unknown'
      return options
    }
  }) as string[]

  // Compare case-insensitively and ignore surrounding whitespace/newlines.
  const actual = result?.[0]?.trim()?.toLowerCase() || ''
  const passed = actual === expected

  // Record the outcome BEFORE asserting: a failing expect() throws, so
  // asserting first would silently drop every failed run from the log and
  // leave the "Failed Tests" section of the report permanently empty.
  testResults.push({
    test: 'spanish',
    prompt,
    result: result || [],
    expected,
    model,
    router,
    timestamp: new Date().toISOString(),
    passed,
    reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
  })

  // Write all results to the same file
  write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))

  expect(actual).toEqual(expected)
})
|
||||
|
||||
// Asks every configured model to translate "no" into French and logs the
// outcome (pass or fail) to TEST_LOG_PATH before asserting.
it.each(models)('should translate "no" to French with model %s', async (modelName) => {
  const prompt = 'translate "no" to French. Return only the translated word, no explanation.'
  const expected = 'non'
  // Overwritten by the onRun hook below with the values found on the options
  // object that `run` hands to the hook.
  let model = 'unknown'
  let router = 'unknown'

  const result = await run({
    prompt,
    mode: 'completion',
    model: modelName,
    path: TEST_BASE_PATH,
    logs: TEST_LOGS_PATH,
    preferences: TEST_PREFERENCES_PATH,
    onRun: async (options) => {
      model = options.model || 'unknown'
      router = options.router || 'unknown'
      return options
    }
  }) as string[]

  // Compare case-insensitively and ignore surrounding whitespace/newlines.
  const actual = result?.[0]?.trim()?.toLowerCase() || ''
  const passed = actual === expected

  // Record the outcome BEFORE asserting: a failing expect() throws, so
  // asserting first would silently drop every failed run from the log and
  // leave the "Failed Tests" section of the report permanently empty.
  testResults.push({
    test: 'french',
    prompt,
    result: result || [],
    expected,
    model,
    router,
    timestamp: new Date().toISOString(),
    passed,
    reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
  })

  // Write all results to the same file
  write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))

  expect(actual).toEqual(expected)
})
|
||||
|
||||
// Builds language-report.md from the accumulated results: for each
// test/model pair only the most recent entry is reported, failed entries
// first, then passed ones.
it('should generate markdown report', () => {
  // test name -> (model name -> latest result for that pair)
  const latestResults = new Map<string, Map<string, TestResult>>()

  // Keep only the newest entry per test+model combination
  for (const result of testResults) {
    if (!latestResults.has(result.test)) {
      latestResults.set(result.test, new Map())
    }
    const testMap = latestResults.get(result.test)!
    const existingResult = testMap.get(result.model)
    if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
      testMap.set(result.model, result)
    }
  }

  // Model output often ends with a newline; left untrimmed it would break the
  // inline code span (the closing backtick lands on its own line).
  const formatActual = (result: TestResult) => String(result.result[0] || '').trim()

  // Renders one report entry; the reason line is only emitted for failures.
  const formatEntry = (testName: string, model: string, result: TestResult, withReason: boolean) => {
    let entry = `### ${testName} - ${model}\n`
    entry += `- Prompt: \`${result.prompt}\`\n`
    entry += `- Expected: \`${result.expected}\`\n`
    entry += `- Actual: \`${formatActual(result)}\`\n`
    if (withReason) {
      // Older log entries may carry no reason; avoid printing "undefined".
      entry += `- Reason: ${result.reason ?? 'n/a'}\n`
    }
    entry += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
    return entry
  }

  // Generate markdown report
  let report = '# Language Test Results\n\n'

  // First list failed tests
  report += '## Failed Tests\n\n'
  for (const [testName, modelResults] of latestResults) {
    for (const [model, result] of modelResults) {
      if (!result.passed) {
        report += formatEntry(testName, model, result, true)
      }
    }
  }

  // Then list passed tests
  report += '## Passed Tests\n\n'
  for (const [testName, modelResults] of latestResults) {
    for (const [model, result] of modelResults) {
      if (result.passed) {
        report += formatEntry(testName, model, result, false)
      }
    }
  }

  // Write report to file
  const reportPath = path.resolve(__dirname, './language-report.md')
  write(reportPath, report)

  // Verify report was written
  expect(exists(reportPath) === 'file').toBe(true)
})
|
||||
})
|
||||
@ -8,55 +8,55 @@
|
||||
- Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Timestamp: 4/1/2025, 12:42:00 PM
|
||||
- Timestamp: 4/1/2025, 12:59:09 PM
|
||||
|
||||
### factorial - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:42:02 PM
|
||||
- Timestamp: 4/1/2025, 12:59:10 PM
|
||||
|
||||
### factorial - gpt-4
|
||||
- Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Timestamp: 4/1/2025, 12:42:03 PM
|
||||
- Timestamp: 4/1/2025, 12:59:11 PM
|
||||
|
||||
### fibonacci - deepseek/deepseek-chat:free
|
||||
- Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
|
||||
- Expected: `0,1,1,2,3`
|
||||
- Actual: `0,1,1,2,3`
|
||||
- Timestamp: 4/1/2025, 12:41:49 PM
|
||||
- Timestamp: 4/1/2025, 12:59:13 PM
|
||||
|
||||
### fibonacci - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
|
||||
- Expected: `0,1,1,2,3`
|
||||
- Actual: `0,1,1,2,3
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:42:10 PM
|
||||
- Timestamp: 4/1/2025, 12:59:14 PM
|
||||
|
||||
### fibonacci - gpt-4
|
||||
- Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
|
||||
- Expected: `0,1,1,2,3`
|
||||
- Actual: `0, 1, 1, 2, 3`
|
||||
- Timestamp: 4/1/2025, 12:42:12 PM
|
||||
- Timestamp: 4/1/2025, 12:59:16 PM
|
||||
|
||||
### quadratic - deepseek/deepseek-chat:free
|
||||
- Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
|
||||
- Expected: `[-3,-2]`
|
||||
- Actual: `[-2, -3]`
|
||||
- Timestamp: 4/1/2025, 12:42:16 PM
|
||||
- Timestamp: 4/1/2025, 12:59:19 PM
|
||||
|
||||
### quadratic - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
|
||||
- Expected: `[-3,-2]`
|
||||
- Actual: `[-2.0, -3.0]`
|
||||
- Timestamp: 4/1/2025, 12:42:17 PM
|
||||
- Actual: `[-2, -3]`
|
||||
- Timestamp: 4/1/2025, 12:46:13 PM
|
||||
|
||||
### quadratic - gpt-4
|
||||
- Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
|
||||
- Expected: `[-3,-2]`
|
||||
- Actual: `[-2, -3]`
|
||||
- Timestamp: 4/1/2025, 12:42:18 PM
|
||||
- Actual: `["-2", "-3"]`
|
||||
- Timestamp: 4/1/2025, 12:59:20 PM
|
||||
|
||||
|
||||
@ -552,5 +552,437 @@
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:42:18.707Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:04.344Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120\n"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:06.496Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:07.467Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0,1,1,2,3"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:08.798Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0,1,1,2,3\n"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:09.986Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0, 1, 1, 2, 3"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:11.342Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[-2, -3]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:12.578Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[-2, -3]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:13.716Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[-2, -3]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:46:14.857Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:47.605Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120\n"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:49.748Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:50.767Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0,1,1,2,3"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:52.399Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0,1,1,2,3\n"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:53.994Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0, 1, 1, 2, 3"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:55.463Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[-2, -3]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:56.689Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[-2, -3]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:53:59.331Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:01.640Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120\n"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:03.712Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:04.767Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0,1,1,2,3"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:07.138Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0,1,1,2,3\n"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:08.328Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0, 1, 1, 2, 3"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:09.776Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[-2, -3]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:10.999Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[-2, -3]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:14.081Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:32.818Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0, 1, 1, 2, 3"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:35.563Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[-2,-3]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:56:37.672Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:09.054Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120\n"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:10.238Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:11.167Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0,1,1,2,3"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:13.500Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0,1,1,2,3\n"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:14.679Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0, 1, 1, 2, 3"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:16.094Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[-2, -3]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:19.103Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[\"-2\", \"-3\"]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:20.963Z",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
@ -4,32 +4,12 @@ import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
|
||||
|
||||
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
|
||||
|
||||
const models = [
|
||||
E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
|
||||
E_OPENROUTER_MODEL_FREE.MODEL_FREE_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE,
|
||||
E_OPENAI_MODEL.MODEL_GPT_4
|
||||
]
|
||||
|
||||
const TEST_BASE_PATH = path.resolve(__dirname, '../../')
|
||||
const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
|
||||
const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './math.json')
|
||||
|
||||
interface TestResult {
|
||||
test: string;
|
||||
prompt: string;
|
||||
result: string[];
|
||||
expected: string;
|
||||
model: string;
|
||||
router: string;
|
||||
timestamp: string;
|
||||
passed: boolean;
|
||||
reason?: string;
|
||||
}
|
||||
|
||||
describe('Advanced Math Operations', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
@ -57,9 +37,9 @@ describe('Advanced Math Operations', () => {
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
const actual = result.map(r => r.trim())[0]
|
||||
}) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim() || ''
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
@ -67,13 +47,13 @@ describe('Advanced Math Operations', () => {
|
||||
testResults.push({
|
||||
test: 'factorial',
|
||||
prompt,
|
||||
result,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
@ -101,7 +81,7 @@ describe('Advanced Math Operations', () => {
|
||||
}) as string[]
|
||||
|
||||
// Handle both formats: "0,1,1,2,3" and "0, 1, 1, 2, 3"
|
||||
const numbers = result[0].trim().split(',').map(n => n.trim())
|
||||
const numbers = result?.[0]?.trim()?.split(',')?.map(n => n.trim()) || []
|
||||
const actual = numbers.join(',')
|
||||
const passed = actual === expected
|
||||
expect(numbers).toEqual(['0', '1', '1', '2', '3'])
|
||||
@ -110,13 +90,13 @@ describe('Advanced Math Operations', () => {
|
||||
testResults.push({
|
||||
test: 'fibonacci',
|
||||
prompt,
|
||||
result,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
@ -145,7 +125,24 @@ describe('Advanced Math Operations', () => {
|
||||
}) as string[]
|
||||
|
||||
// Parse the result as JSON (markdown already stripped by filter)
|
||||
const jsonResult = JSON.parse(result[0].trim())
|
||||
let jsonResult: number[]
|
||||
try {
|
||||
const resultStr = result?.[0]?.trim() || '[]'
|
||||
if (!resultStr) {
|
||||
throw new Error('No result returned from model')
|
||||
}
|
||||
jsonResult = JSON.parse(resultStr)
|
||||
if (!Array.isArray(jsonResult)) {
|
||||
throw new Error('Result is not an array')
|
||||
}
|
||||
// Convert any string numbers to actual numbers
|
||||
jsonResult = jsonResult.map(n => typeof n === 'string' ? parseFloat(n) : n)
|
||||
} catch (error) {
|
||||
// If parsing fails, try to extract numbers from the string
|
||||
const numbers = result?.[0]?.match(/-?\d+/g)?.map(n => parseInt(n, 10)) || []
|
||||
jsonResult = numbers
|
||||
}
|
||||
|
||||
const actual = JSON.stringify(jsonResult.sort())
|
||||
const expectedSorted = JSON.stringify([-3, -2].sort())
|
||||
const passed = actual === expectedSorted
|
||||
@ -155,16 +152,72 @@ describe('Advanced Math Operations', () => {
|
||||
testResults.push({
|
||||
test: 'quadratic',
|
||||
prompt,
|
||||
result,
|
||||
result: result || [],
|
||||
expected: expectedDisplay,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: passed ? undefined : `Expected ${expectedDisplay}, but got ${result[0]}`
|
||||
reason: !result?.[0] ? 'No result returned from model' : passed ? undefined : `Expected ${expectedDisplay}, but got ${result?.[0] || ''}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
})
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
// Group results by test and model
|
||||
const latestResults = new Map<string, Map<string, TestResult>>()
|
||||
|
||||
// Get only the latest result for each test+model combination
|
||||
testResults.forEach(result => {
|
||||
if (!latestResults.has(result.test)) {
|
||||
latestResults.set(result.test, new Map())
|
||||
}
|
||||
const testMap = latestResults.get(result.test)!
|
||||
const existingResult = testMap.get(result.model)
|
||||
if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
|
||||
testMap.set(result.model, result)
|
||||
}
|
||||
})
|
||||
|
||||
// Generate markdown report
|
||||
let report = '# Math Test Results\n\n'
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write report to file
|
||||
const reportPath = path.resolve(__dirname, './math-report.md')
|
||||
write(reportPath, report)
|
||||
|
||||
// Verify report was written
|
||||
expect(exists(reportPath) === 'file').toBe(true)
|
||||
})
|
||||
})
|
||||
Loading…
Reference in New Issue
Block a user