revert
This commit is contained in:
parent
66cc6ac754
commit
54b9c5dd9c
2
packages/kbot/dist-in/zod_types.d.ts
vendored
2
packages/kbot/dist-in/zod_types.d.ts
vendored
@ -2,7 +2,7 @@ export interface IKBotOptions {
|
||||
/** Target directory */
|
||||
path?: string;
|
||||
/** The prompt. Supports file paths and environment variables. */
|
||||
prompt?: string;
|
||||
prompt?: string | undefined;
|
||||
/** Optional output path for modified files (Tool mode only) */
|
||||
output?: string | undefined;
|
||||
/** Optional destination path for the result, will substitute ${MODEL_NAME} and ${ROUTER} in the path. Optional, used for "completion" mode */
|
||||
|
||||
@ -1,207 +1,14 @@
|
||||
{
|
||||
"model": "openai/gpt-4o",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "List all files in the directory C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\test-data. Return the list as a JSON array of filenames."
|
||||
"content": "divide 15 by 3. Return only the number, no explanation."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "USER Preferences : # Preferences\r\n\r\nYou are a helpful AI assistant. When asked to perform calculations, you should return only the numerical result without any explanation or comments. "
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "list_files",
|
||||
"description": "List all files in a directory",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"directory": {
|
||||
"type": "string"
|
||||
},
|
||||
"pattern": {
|
||||
"type": "string",
|
||||
"optional": true
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"directory"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "read_files",
|
||||
"description": "Reads files in a directory with a given pattern",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"directory": {
|
||||
"type": "string"
|
||||
},
|
||||
"pattern": {
|
||||
"type": "string",
|
||||
"optional": true
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"directory"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "remove_file",
|
||||
"description": "Remove a file at given path",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"path"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "rename_file",
|
||||
"description": "Rename or move a file or directory",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"src": {
|
||||
"type": "string"
|
||||
},
|
||||
"dst": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"path"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "modify_project_files",
|
||||
"description": "Create or modify existing project files in one shot, preferably used for creating project structure)",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"files": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string",
|
||||
"description": "base64 encoded string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"path",
|
||||
"content"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"files"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "write_file",
|
||||
"description": "Writes to a file, given a path and content (base64). No directory or file exists check needed!",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string",
|
||||
"description": "base64 encoded string"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"file"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "file_exists",
|
||||
"description": "check if a file or folder exists",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"file"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "read_file",
|
||||
"description": "read a file, at given a path",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"file"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"tool_choice": "auto",
|
||||
"parallel_tool_calls": false
|
||||
"tools": []
|
||||
}
|
||||
@ -13,8 +13,7 @@
|
||||
},
|
||||
"prompt": {
|
||||
"type": "string",
|
||||
"description": "The prompt. Supports file paths and environment variables.",
|
||||
"default": "./prompt.md"
|
||||
"description": "The prompt. Supports file paths and environment variables."
|
||||
},
|
||||
"output": {
|
||||
"type": "string",
|
||||
@ -101,7 +100,7 @@
|
||||
},
|
||||
"logLevel": {
|
||||
"type": "number",
|
||||
"default": 2,
|
||||
"default": 4,
|
||||
"description": "Logging level for the application"
|
||||
},
|
||||
"profile": {
|
||||
|
||||
@ -15,8 +15,7 @@
|
||||
},
|
||||
"prompt": {
|
||||
"ui:description": "The prompt. Supports file paths and environment variables.",
|
||||
"ui:title": "Prompt",
|
||||
"ui:placeholder": "./prompt.md"
|
||||
"ui:title": "Prompt"
|
||||
},
|
||||
"output": {
|
||||
"ui:description": "Optional output path for modified files (Tool mode only)",
|
||||
@ -79,7 +78,7 @@
|
||||
"logLevel": {
|
||||
"ui:description": "Logging level for the application",
|
||||
"ui:title": "Loglevel",
|
||||
"ui:placeholder": 2
|
||||
"ui:placeholder": 4
|
||||
},
|
||||
"profile": {
|
||||
"ui:description": "Path to profile for variables. Supports environment variables.",
|
||||
|
||||
@ -2,7 +2,7 @@ export interface IKBotOptions {
|
||||
/** Target directory */
|
||||
path?: string;
|
||||
/** The prompt. Supports file paths and environment variables. */
|
||||
prompt?: string;
|
||||
prompt?: string | undefined;
|
||||
/** Optional output path for modified files (Tool mode only) */
|
||||
output?: string | undefined;
|
||||
/** Optional destination path for the result, will substitute ${MODEL_NAME} and ${ROUTER} in the path. Optional, used for "completion" mode */
|
||||
|
||||
@ -6276,6 +6276,133 @@
|
||||
"duration": 2274,
|
||||
"reason": "Expected [], but got {\"files\":[]}",
|
||||
"category": "tools"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-02T22:06:38.904Z",
|
||||
"passed": true,
|
||||
"duration": 2263,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-02T22:06:41.138Z",
|
||||
"passed": true,
|
||||
"duration": 2228,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-02T22:06:41.934Z",
|
||||
"passed": true,
|
||||
"duration": 791,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-02T22:06:43.239Z",
|
||||
"passed": true,
|
||||
"duration": 1300,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8 × 3 = 24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-02T22:06:46.025Z",
|
||||
"passed": false,
|
||||
"duration": 2782,
|
||||
"reason": "Expected 24, but got 8 × 3 = 24",
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-02T22:06:47.239Z",
|
||||
"passed": true,
|
||||
"duration": 1206,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-02T22:06:48.026Z",
|
||||
"passed": true,
|
||||
"duration": 783,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-02T22:06:51.012Z",
|
||||
"passed": true,
|
||||
"duration": 2982,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-02T22:06:51.777Z",
|
||||
"passed": true,
|
||||
"duration": 760,
|
||||
"category": "basic"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
@ -6359,8 +6486,8 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 885,
|
||||
"duration_secs": 0.885
|
||||
"duration": 791,
|
||||
"duration_secs": 0.791
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
@ -6373,14 +6500,14 @@
|
||||
"test": "multiplication",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 984,
|
||||
"duration_secs": 0.984
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1190,
|
||||
"duration_secs": 1.19
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1111,
|
||||
"duration_secs": 1.111
|
||||
"duration": 1206,
|
||||
"duration_secs": 1.206
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -6388,14 +6515,14 @@
|
||||
"test": "division",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 889,
|
||||
"duration_secs": 0.889
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 760,
|
||||
"duration_secs": 0.76
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 917,
|
||||
"duration_secs": 0.917
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 783,
|
||||
"duration_secs": 0.783
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -6460,5 +6587,5 @@
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-02T19:29:30.523Z"
|
||||
"lastUpdated": "2025-04-02T22:06:51.780Z"
|
||||
}
|
||||
@ -997,6 +997,133 @@
|
||||
"passed": true,
|
||||
"duration": 3646,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-02T22:06:38.904Z",
|
||||
"passed": true,
|
||||
"duration": 2263,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-02T22:06:41.138Z",
|
||||
"passed": true,
|
||||
"duration": 2228,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-02T22:06:41.934Z",
|
||||
"passed": true,
|
||||
"duration": 791,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-02T22:06:43.239Z",
|
||||
"passed": true,
|
||||
"duration": 1300,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8 × 3 = 24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-02T22:06:46.025Z",
|
||||
"passed": false,
|
||||
"duration": 2782,
|
||||
"reason": "Expected 24, but got 8 × 3 = 24",
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-02T22:06:47.239Z",
|
||||
"passed": true,
|
||||
"duration": 1206,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-02T22:06:48.026Z",
|
||||
"passed": true,
|
||||
"duration": 783,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-02T22:06:51.012Z",
|
||||
"passed": true,
|
||||
"duration": 2982,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-02T22:06:51.777Z",
|
||||
"passed": true,
|
||||
"duration": 760,
|
||||
"category": "basic"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
@ -1005,8 +1132,8 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 885,
|
||||
"duration_secs": 0.885
|
||||
"duration": 791,
|
||||
"duration_secs": 0.791
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
@ -1019,14 +1146,14 @@
|
||||
"test": "multiplication",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 984,
|
||||
"duration_secs": 0.984
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1190,
|
||||
"duration_secs": 1.19
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1111,
|
||||
"duration_secs": 1.111
|
||||
"duration": 1206,
|
||||
"duration_secs": 1.206
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1034,17 +1161,17 @@
|
||||
"test": "division",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 889,
|
||||
"duration_secs": 0.889
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 760,
|
||||
"duration_secs": 0.76
|
||||
},
|
||||
{
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 917,
|
||||
"duration_secs": 0.917
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 783,
|
||||
"duration_secs": 0.783
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-02T13:45:10.308Z"
|
||||
"lastUpdated": "2025-04-02T22:06:51.777Z"
|
||||
}
|
||||
@ -2,260 +2,102 @@
|
||||
|
||||
## Highscores
|
||||
|
||||
### Performance Rankings (Duration)
|
||||
|
||||
| Test | Model | Duration (ms) | Duration (s) |
|
||||
|------|-------|--------------|--------------|
|
||||
| addition | openai/gpt-4o-mini | 885 | 0.89 |
|
||||
| division | openai/gpt-3.5-turbo | 889 | 0.89 |
|
||||
| division | qwen/qwq-32b | 917 | 0.92 |
|
||||
| multiplication | openai/gpt-3.5-turbo | 984 | 0.98 |
|
||||
| division | openai/gpt-4o-mini | 1104 | 1.10 |
|
||||
| multiplication | openai/gpt-4o-mini | 1111 | 1.11 |
|
||||
| multiplication | anthropic/claude-3.5-sonnet | 1190 | 1.19 |
|
||||
| division | anthropic/claude-3.5-sonnet | 1405 | 1.41 |
|
||||
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 1558 | 1.56 |
|
||||
| addition | anthropic/claude-3.5-sonnet | 1689 | 1.69 |
|
||||
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 3646 | 3.65 |
|
||||
| addition | qwen/qwq-32b | 3807 | 3.81 |
|
||||
| multiplication | qwen/qwq-32b | 5008 | 5.01 |
|
||||
| division | deepseek/deepseek-r1 | 7130 | 7.13 |
|
||||
| addition | openai/gpt-3.5-turbo | 10455 | 10.46 |
|
||||
| addition | deepseek/deepseek-r1 | 12064 | 12.06 |
|
||||
| addition | openai/gpt-4o-mini | 791 | 0.79 |
|
||||
| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 2228 | 2.23 |
|
||||
| addition | openai/gpt-3.5-turbo | 2263 | 2.26 |
|
||||
| multiplication | openai/gpt-4o-mini | 1206 | 1.21 |
|
||||
| multiplication | openai/gpt-3.5-turbo | 1300 | 1.30 |
|
||||
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 2782 | 2.78 |
|
||||
| division | openai/gpt-4o-mini | 760 | 0.76 |
|
||||
| division | openai/gpt-3.5-turbo | 783 | 0.78 |
|
||||
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 2982 | 2.98 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 18
|
||||
- Passed: 16
|
||||
- Failed: 2
|
||||
- Total Tests: 9
|
||||
- Passed: 8
|
||||
- Failed: 1
|
||||
- Success Rate: 88.89%
|
||||
- Average Duration: 3639ms (3.64s)
|
||||
- Average Duration: 1677ms (1.68s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### addition - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `The sum of 5 and 3 is 8. Therefore, the result is \boxed{8}.`
|
||||
- Duration: 6405ms (6405.00s)
|
||||
- Reason: Expected 8, but got the sum of 5 and 3 is 8. therefore, the result is \boxed{8}.
|
||||
- Timestamp: 4/2/2025, 3:44:40 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-r1
|
||||
### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24
|
||||
|
||||
24
|
||||
|
||||
The result is 24.
|
||||
|
||||
24
|
||||
|
||||
Here's the answer: 24
|
||||
|
||||
The answer will be 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
The product of 8 and 3 is 24.
|
||||
|
||||
24
|
||||
|
||||
The answer is 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
The result is 24.
|
||||
|
||||
24
|
||||
|
||||
Here's the numerical result: 24
|
||||
|
||||
The answer is 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
The answer is 24.`
|
||||
- Duration: 5258ms (5258.00s)
|
||||
- Reason: Expected 24, but got 24
|
||||
|
||||
24
|
||||
|
||||
the result is 24.
|
||||
|
||||
24
|
||||
|
||||
here's the answer: 24
|
||||
|
||||
the answer will be 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
the product of 8 and 3 is 24.
|
||||
|
||||
24
|
||||
|
||||
the answer is 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
the result is 24.
|
||||
|
||||
24
|
||||
|
||||
here's the numerical result: 24
|
||||
|
||||
the answer is 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
the answer is 24.
|
||||
- Timestamp: 4/2/2025, 3:44:53 PM
|
||||
- Actual: `8 × 3 = 24`
|
||||
- Duration: 2782ms (2.78s)
|
||||
- Reason: Expected 24, but got 8 × 3 = 24
|
||||
- Timestamp: 4/3/2025, 12:06:46 AM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### addition - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1689ms (1689.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:06 PM
|
||||
|
||||
### addition - qwen/qwq-32b
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 3807ms (3807.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:10 PM
|
||||
|
||||
### addition - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 885ms (885.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:11 PM
|
||||
|
||||
### addition - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 10455ms (10455.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:21 PM
|
||||
- Duration: 2263ms (2.26s)
|
||||
- Timestamp: 4/3/2025, 12:06:38 AM
|
||||
|
||||
### addition - deepseek/deepseek-r1
|
||||
### addition - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 12064ms (12064.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:33 PM
|
||||
- Duration: 2228ms (2.23s)
|
||||
- Timestamp: 4/3/2025, 12:06:41 AM
|
||||
|
||||
### multiplication - anthropic/claude-3.5-sonnet
|
||||
### addition - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 1190ms (1190.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:41 PM
|
||||
|
||||
### multiplication - qwen/qwq-32b
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 5008ms (5008.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:46 PM
|
||||
|
||||
### multiplication - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 1111ms (1111.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:47 PM
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 791ms (0.79s)
|
||||
- Timestamp: 4/3/2025, 12:06:41 AM
|
||||
|
||||
### multiplication - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 984ms (984.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:48 PM
|
||||
- Duration: 1300ms (1.30s)
|
||||
- Timestamp: 4/3/2025, 12:06:43 AM
|
||||
|
||||
### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
### multiplication - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 1558ms (1558.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:55 PM
|
||||
|
||||
### division - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 1405ms (1405.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:56 PM
|
||||
|
||||
### division - qwen/qwq-32b
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 917ms (917.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:57 PM
|
||||
|
||||
### division - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 1104ms (1104.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:58 PM
|
||||
- Duration: 1206ms (1.21s)
|
||||
- Timestamp: 4/3/2025, 12:06:47 AM
|
||||
|
||||
### division - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 889ms (889.00s)
|
||||
- Timestamp: 4/2/2025, 3:44:59 PM
|
||||
|
||||
### division - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 7130ms (7130.00s)
|
||||
- Timestamp: 4/2/2025, 3:45:06 PM
|
||||
- Duration: 783ms (0.78s)
|
||||
- Timestamp: 4/3/2025, 12:06:48 AM
|
||||
|
||||
### division - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 3646ms (3646.00s)
|
||||
- Timestamp: 4/2/2025, 3:45:10 PM
|
||||
- Duration: 2982ms (2.98s)
|
||||
- Timestamp: 4/3/2025, 12:06:51 AM
|
||||
|
||||
### division - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 760ms (0.76s)
|
||||
- Timestamp: 4/3/2025, 12:06:51 AM
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user