revert

2025-04-03 00:07:12 +02:00 · 2025-04-03 00:07:12 +02:00 · 54b9c5dd9c
commit 54b9c5dd9c
parent 66cc6ac754
8 changed files with 339 additions and 438 deletions
--- a/packages/kbot/dist-in/zod_types.d.ts
+++ b/packages/kbot/dist-in/zod_types.d.ts
@ -2,7 +2,7 @@ export interface IKBotOptions {
    /** Target directory */
    path?: string;
    /** The prompt. Supports file paths and environment variables. */
-    prompt?: string;
+    prompt?: string | undefined;
    /** Optional output path for modified files (Tool mode only) */
    output?: string | undefined;
    /** Optional destination path for the result, will substitute ${MODEL_NAME} and ${ROUTER} in the path. Optional, used for "completion" mode */
--- a/packages/kbot/logs/params.json
+++ b/packages/kbot/logs/params.json
@ -1,207 +1,14 @@
 {
-  "model": "openai/gpt-4o",
+  "model": "openai/gpt-4o-mini",
  "messages": [
    {
      "role": "user",
-      "content": "List all files in the directory C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\test-data. Return the list as a JSON array of filenames."
+      "content": "divide 15 by 3. Return only the number, no explanation."
    },
    {
      "role": "user",
      "content": "USER Preferences : # Preferences\r\n\r\nYou are a helpful AI assistant. When asked to perform calculations, you should return only the numerical result without any explanation or comments. "
    }
  ],
-  "tools": [
-    {
-      "type": "function",
-      "function": {
-        "name": "list_files",
-        "description": "List all files in a directory",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "directory": {
-              "type": "string"
-            },
-            "pattern": {
-              "type": "string",
-              "optional": true
-            }
-          },
-          "required": [
-            "directory"
-          ]
-        }
-      }
-    },
-    {
-      "type": "function",
-      "function": {
-        "name": "read_files",
-        "description": "Reads files in a directory with a given pattern",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "directory": {
-              "type": "string"
-            },
-            "pattern": {
-              "type": "string",
-              "optional": true
-            }
-          },
-          "required": [
-            "directory"
-          ]
-        }
-      }
-    },
-    {
-      "type": "function",
-      "function": {
-        "name": "remove_file",
-        "description": "Remove a file at given path",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "path": {
-              "type": "string"
-            }
-          },
-          "required": [
-            "path"
-          ]
-        }
-      }
-    },
-    {
-      "type": "function",
-      "function": {
-        "name": "rename_file",
-        "description": "Rename or move a file or directory",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "src": {
-              "type": "string"
-            },
-            "dst": {
-              "type": "string"
-            }
-          },
-          "required": [
-            "path"
-          ]
-        }
-      }
-    },
-    {
-      "type": "function",
-      "function": {
-        "name": "modify_project_files",
-        "description": "Create or modify existing project files in one shot, preferably used for creating project structure)",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "files": {
-              "type": "array",
-              "items": {
-                "type": "object",
-                "properties": {
-                  "path": {
-                    "type": "string"
-                  },
-                  "content": {
-                    "type": "string",
-                    "description": "base64 encoded string"
-                  }
-                },
-                "required": [
-                  "path",
-                  "content"
-                ]
-              }
-            }
-          },
-          "required": [
-            "files"
-          ]
-        }
-      }
-    },
-    {
-      "type": "function",
-      "function": {
-        "name": "write_file",
-        "description": "Writes to a file, given a path and content (base64). No directory or file exists check needed!",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "file": {
-              "type": "object",
-              "properties": {
-                "path": {
-                  "type": "string"
-                },
-                "content": {
-                  "type": "string",
-                  "description": "base64 encoded string"
-                }
-              }
-            }
-          },
-          "required": [
-            "file"
-          ]
-        }
-      }
-    },
-    {
-      "type": "function",
-      "function": {
-        "name": "file_exists",
-        "description": "check if a file or folder exists",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "file": {
-              "type": "object",
-              "properties": {
-                "path": {
-                  "type": "string"
-                }
-              }
-            }
-          },
-          "required": [
-            "file"
-          ]
-        }
-      }
-    },
-    {
-      "type": "function",
-      "function": {
-        "name": "read_file",
-        "description": "read a file, at given a path",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "file": {
-              "type": "object",
-              "properties": {
-                "path": {
-                  "type": "string"
-                }
-              }
-            }
-          },
-          "required": [
-            "file"
-          ]
-        }
-      }
-    }
-  ],
-  "tool_choice": "auto",
-  "parallel_tool_calls": false
+  "tools": []
 }
--- a/packages/kbot/schema.json
+++ b/packages/kbot/schema.json
@ -13,8 +13,7 @@
          },
          "prompt": {
            "type": "string",
-            "description": "The prompt. Supports file paths and environment variables.",
-            "default": "./prompt.md"
+            "description": "The prompt. Supports file paths and environment variables."
          },
          "output": {
            "type": "string",
@ -101,7 +100,7 @@
          },
          "logLevel": {
            "type": "number",
-            "default": 2,
+            "default": 4,
            "description": "Logging level for the application"
          },
          "profile": {
--- a/packages/kbot/schema_ui.json
+++ b/packages/kbot/schema_ui.json
@ -15,8 +15,7 @@
  },
  "prompt": {
    "ui:description": "The prompt. Supports file paths and environment variables.",
-    "ui:title": "Prompt",
-    "ui:placeholder": "./prompt.md"
+    "ui:title": "Prompt"
  },
  "output": {
    "ui:description": "Optional output path for modified files (Tool mode only)",
@ -79,7 +78,7 @@
  "logLevel": {
    "ui:description": "Logging level for the application",
    "ui:title": "Loglevel",
-    "ui:placeholder": 2
+    "ui:placeholder": 4
  },
  "profile": {
    "ui:description": "Path to profile for variables. Supports environment variables.",
--- a/packages/kbot/src/zod_types.ts
+++ b/packages/kbot/src/zod_types.ts
@ -2,7 +2,7 @@ export interface IKBotOptions {
    /** Target directory */
    path?: string;
    /** The prompt. Supports file paths and environment variables. */
-    prompt?: string;
+    prompt?: string | undefined;
    /** Optional output path for modified files (Tool mode only) */
    output?: string | undefined;
    /** Optional destination path for the result, will substitute ${MODEL_NAME} and ${ROUTER} in the path. Optional, used for "completion" mode */
--- a/packages/kbot/tests/unit/reports/all.json
+++ b/packages/kbot/tests/unit/reports/all.json
@ -6276,6 +6276,133 @@
      "duration": 2274,
      "reason": "Expected [], but got {\"files\":[]}",
      "category": "tools"
+    },
+    {
+      "test": "addition",
+      "prompt": "add 5 and 3. Return only the number, no explanation.",
+      "result": [
+        "8"
+      ],
+      "expected": "8",
+      "model": "openai/gpt-3.5-turbo",
+      "router": "openai/gpt-3.5-turbo",
+      "timestamp": "2025-04-02T22:06:38.904Z",
+      "passed": true,
+      "duration": 2263,
+      "category": "basic"
+    },
+    {
+      "test": "addition",
+      "prompt": "add 5 and 3. Return only the number, no explanation.",
+      "result": [
+        "8"
+      ],
+      "expected": "8",
+      "model": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "router": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "timestamp": "2025-04-02T22:06:41.138Z",
+      "passed": true,
+      "duration": 2228,
+      "category": "basic"
+    },
+    {
+      "test": "addition",
+      "prompt": "add 5 and 3. Return only the number, no explanation.",
+      "result": [
+        "8"
+      ],
+      "expected": "8",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-04-02T22:06:41.934Z",
+      "passed": true,
+      "duration": 791,
+      "category": "basic"
+    },
+    {
+      "test": "multiplication",
+      "prompt": "multiply 8 and 3. Return only the number, no explanation.",
+      "result": [
+        "24"
+      ],
+      "expected": "24",
+      "model": "openai/gpt-3.5-turbo",
+      "router": "openai/gpt-3.5-turbo",
+      "timestamp": "2025-04-02T22:06:43.239Z",
+      "passed": true,
+      "duration": 1300,
+      "category": "basic"
+    },
+    {
+      "test": "multiplication",
+      "prompt": "multiply 8 and 3. Return only the number, no explanation.",
+      "result": [
+        "8 × 3 = 24"
+      ],
+      "expected": "24",
+      "model": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "router": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "timestamp": "2025-04-02T22:06:46.025Z",
+      "passed": false,
+      "duration": 2782,
+      "reason": "Expected 24, but got 8 × 3 = 24",
+      "category": "basic"
+    },
+    {
+      "test": "multiplication",
+      "prompt": "multiply 8 and 3. Return only the number, no explanation.",
+      "result": [
+        "24"
+      ],
+      "expected": "24",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-04-02T22:06:47.239Z",
+      "passed": true,
+      "duration": 1206,
+      "category": "basic"
+    },
+    {
+      "test": "division",
+      "prompt": "divide 15 by 3. Return only the number, no explanation.",
+      "result": [
+        "5"
+      ],
+      "expected": "5",
+      "model": "openai/gpt-3.5-turbo",
+      "router": "openai/gpt-3.5-turbo",
+      "timestamp": "2025-04-02T22:06:48.026Z",
+      "passed": true,
+      "duration": 783,
+      "category": "basic"
+    },
+    {
+      "test": "division",
+      "prompt": "divide 15 by 3. Return only the number, no explanation.",
+      "result": [
+        "5"
+      ],
+      "expected": "5",
+      "model": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "router": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "timestamp": "2025-04-02T22:06:51.012Z",
+      "passed": true,
+      "duration": 2982,
+      "category": "basic"
+    },
+    {
+      "test": "division",
+      "prompt": "divide 15 by 3. Return only the number, no explanation.",
+      "result": [
+        "5"
+      ],
+      "expected": "5",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-04-02T22:06:51.777Z",
+      "passed": true,
+      "duration": 760,
+      "category": "basic"
    }
  ],
  "highscores": [
@ -6359,8 +6486,8 @@
      "rankings": [
        {
          "model": "openai/gpt-4o-mini",
-          "duration": 885,
-          "duration_secs": 0.885
+          "duration": 791,
+          "duration_secs": 0.791
        },
        {
          "model": "anthropic/claude-3.5-sonnet",
@ -6373,14 +6500,14 @@
      "test": "multiplication",
      "rankings": [
        {
-          "model": "openai/gpt-3.5-turbo",
-          "duration": 984,
-          "duration_secs": 0.984
+          "model": "anthropic/claude-3.5-sonnet",
+          "duration": 1190,
+          "duration_secs": 1.19
        },
        {
          "model": "openai/gpt-4o-mini",
-          "duration": 1111,
-          "duration_secs": 1.111
+          "duration": 1206,
+          "duration_secs": 1.206
        }
      ]
    },
@ -6388,14 +6515,14 @@
      "test": "division",
      "rankings": [
        {
-          "model": "openai/gpt-3.5-turbo",
-          "duration": 889,
-          "duration_secs": 0.889
+          "model": "openai/gpt-4o-mini",
+          "duration": 760,
+          "duration_secs": 0.76
        },
        {
-          "model": "qwen/qwq-32b",
-          "duration": 917,
-          "duration_secs": 0.917
+          "model": "openai/gpt-3.5-turbo",
+          "duration": 783,
+          "duration_secs": 0.783
        }
      ]
    },
@ -6460,5 +6587,5 @@
      ]
    }
  ],
-  "lastUpdated": "2025-04-02T19:29:30.523Z"
+  "lastUpdated": "2025-04-02T22:06:51.780Z"
 }
--- a/packages/kbot/tests/unit/reports/basic.json
+++ b/packages/kbot/tests/unit/reports/basic.json
@ -997,6 +997,133 @@
      "passed": true,
      "duration": 3646,
      "category": "basic"
+    },
+    {
+      "test": "addition",
+      "prompt": "add 5 and 3. Return only the number, no explanation.",
+      "result": [
+        "8"
+      ],
+      "expected": "8",
+      "model": "openai/gpt-3.5-turbo",
+      "router": "openai/gpt-3.5-turbo",
+      "timestamp": "2025-04-02T22:06:38.904Z",
+      "passed": true,
+      "duration": 2263,
+      "category": "basic"
+    },
+    {
+      "test": "addition",
+      "prompt": "add 5 and 3. Return only the number, no explanation.",
+      "result": [
+        "8"
+      ],
+      "expected": "8",
+      "model": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "router": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "timestamp": "2025-04-02T22:06:41.138Z",
+      "passed": true,
+      "duration": 2228,
+      "category": "basic"
+    },
+    {
+      "test": "addition",
+      "prompt": "add 5 and 3. Return only the number, no explanation.",
+      "result": [
+        "8"
+      ],
+      "expected": "8",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-04-02T22:06:41.934Z",
+      "passed": true,
+      "duration": 791,
+      "category": "basic"
+    },
+    {
+      "test": "multiplication",
+      "prompt": "multiply 8 and 3. Return only the number, no explanation.",
+      "result": [
+        "24"
+      ],
+      "expected": "24",
+      "model": "openai/gpt-3.5-turbo",
+      "router": "openai/gpt-3.5-turbo",
+      "timestamp": "2025-04-02T22:06:43.239Z",
+      "passed": true,
+      "duration": 1300,
+      "category": "basic"
+    },
+    {
+      "test": "multiplication",
+      "prompt": "multiply 8 and 3. Return only the number, no explanation.",
+      "result": [
+        "8 × 3 = 24"
+      ],
+      "expected": "24",
+      "model": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "router": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "timestamp": "2025-04-02T22:06:46.025Z",
+      "passed": false,
+      "duration": 2782,
+      "reason": "Expected 24, but got 8 × 3 = 24",
+      "category": "basic"
+    },
+    {
+      "test": "multiplication",
+      "prompt": "multiply 8 and 3. Return only the number, no explanation.",
+      "result": [
+        "24"
+      ],
+      "expected": "24",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-04-02T22:06:47.239Z",
+      "passed": true,
+      "duration": 1206,
+      "category": "basic"
+    },
+    {
+      "test": "division",
+      "prompt": "divide 15 by 3. Return only the number, no explanation.",
+      "result": [
+        "5"
+      ],
+      "expected": "5",
+      "model": "openai/gpt-3.5-turbo",
+      "router": "openai/gpt-3.5-turbo",
+      "timestamp": "2025-04-02T22:06:48.026Z",
+      "passed": true,
+      "duration": 783,
+      "category": "basic"
+    },
+    {
+      "test": "division",
+      "prompt": "divide 15 by 3. Return only the number, no explanation.",
+      "result": [
+        "5"
+      ],
+      "expected": "5",
+      "model": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "router": "deepseek/deepseek-r1-distill-qwen-14b:free",
+      "timestamp": "2025-04-02T22:06:51.012Z",
+      "passed": true,
+      "duration": 2982,
+      "category": "basic"
+    },
+    {
+      "test": "division",
+      "prompt": "divide 15 by 3. Return only the number, no explanation.",
+      "result": [
+        "5"
+      ],
+      "expected": "5",
+      "model": "openai/gpt-4o-mini",
+      "router": "openai/gpt-4o-mini",
+      "timestamp": "2025-04-02T22:06:51.777Z",
+      "passed": true,
+      "duration": 760,
+      "category": "basic"
    }
  ],
  "highscores": [
@ -1005,8 +1132,8 @@
      "rankings": [
        {
          "model": "openai/gpt-4o-mini",
-          "duration": 885,
-          "duration_secs": 0.885
+          "duration": 791,
+          "duration_secs": 0.791
        },
        {
          "model": "anthropic/claude-3.5-sonnet",
@ -1019,14 +1146,14 @@
      "test": "multiplication",
      "rankings": [
        {
-          "model": "openai/gpt-3.5-turbo",
-          "duration": 984,
-          "duration_secs": 0.984
+          "model": "anthropic/claude-3.5-sonnet",
+          "duration": 1190,
+          "duration_secs": 1.19
        },
        {
          "model": "openai/gpt-4o-mini",
-          "duration": 1111,
-          "duration_secs": 1.111
+          "duration": 1206,
+          "duration_secs": 1.206
        }
      ]
    },
@ -1034,17 +1161,17 @@
      "test": "division",
      "rankings": [
        {
-          "model": "openai/gpt-3.5-turbo",
-          "duration": 889,
-          "duration_secs": 0.889
+          "model": "openai/gpt-4o-mini",
+          "duration": 760,
+          "duration_secs": 0.76
        },
        {
-          "model": "qwen/qwq-32b",
-          "duration": 917,
-          "duration_secs": 0.917
+          "model": "openai/gpt-3.5-turbo",
+          "duration": 783,
+          "duration_secs": 0.783
        }
      ]
    }
  ],
-  "lastUpdated": "2025-04-02T13:45:10.308Z"
+  "lastUpdated": "2025-04-02T22:06:51.777Z"
 }
--- a/packages/kbot/tests/unit/reports/basic.md
+++ b/packages/kbot/tests/unit/reports/basic.md
@ -2,260 +2,102 @@

 ## Highscores

+### Performance Rankings (Duration)
+
 | Test | Model | Duration (ms) | Duration (s) |
 |------|-------|--------------|--------------|
-| addition | openai/gpt-4o-mini | 885 | 0.89 |
-| division | openai/gpt-3.5-turbo | 889 | 0.89 |
-| division | qwen/qwq-32b | 917 | 0.92 |
-| multiplication | openai/gpt-3.5-turbo | 984 | 0.98 |
-| division | openai/gpt-4o-mini | 1104 | 1.10 |
-| multiplication | openai/gpt-4o-mini | 1111 | 1.11 |
-| multiplication | anthropic/claude-3.5-sonnet | 1190 | 1.19 |
-| division | anthropic/claude-3.5-sonnet | 1405 | 1.41 |
-| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 1558 | 1.56 |
-| addition | anthropic/claude-3.5-sonnet | 1689 | 1.69 |
-| division | deepseek/deepseek-r1-distill-qwen-14b:free | 3646 | 3.65 |
-| addition | qwen/qwq-32b | 3807 | 3.81 |
-| multiplication | qwen/qwq-32b | 5008 | 5.01 |
-| division | deepseek/deepseek-r1 | 7130 | 7.13 |
-| addition | openai/gpt-3.5-turbo | 10455 | 10.46 |
-| addition | deepseek/deepseek-r1 | 12064 | 12.06 |
+| addition | openai/gpt-4o-mini | 791 | 0.79 |
+| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 2228 | 2.23 |
+| addition | openai/gpt-3.5-turbo | 2263 | 2.26 |
+| multiplication | openai/gpt-4o-mini | 1206 | 1.21 |
+| multiplication | openai/gpt-3.5-turbo | 1300 | 1.30 |
+| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 2782 | 2.78 |
+| division | openai/gpt-4o-mini | 760 | 0.76 |
+| division | openai/gpt-3.5-turbo | 783 | 0.78 |
+| division | deepseek/deepseek-r1-distill-qwen-14b:free | 2982 | 2.98 |

 ## Summary

- Total Tests: 18
- Passed: 16
- Failed: 2
+- Total Tests: 9
+- Passed: 8
+- Failed: 1
 - Success Rate: 88.89%
- Average Duration: 3639ms (3.64s)
+- Average Duration: 1677ms (1.68s)

 ## Failed Tests

-### addition - deepseek/deepseek-r1-distill-qwen-14b:free
-
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `The sum of 5 and 3 is 8. Therefore, the result is \boxed{8}.`
- Duration: 6405ms (6405.00s)
- Reason: Expected 8, but got the sum of 5 and 3 is 8. therefore, the result is \boxed{8}.
- Timestamp: 4/2/2025, 3:44:40 PM
-
-### multiplication - deepseek/deepseek-r1
+### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free

 - Prompt: `multiply 8 and 3. Return only the number, no explanation.`
 - Expected: `24`
- Actual: `24
-
-24
-
-The result is 24.
-
-24
-
-Here's the answer: 24
-
-The answer will be 24.
-
-24
-
-24
-
-The product of 8 and 3 is 24.
-
-24
-
-The answer is 24.
-
-24
-
-24
-
-24
-
-The result is 24.
-
-24
-
-Here's the numerical result: 24
-
-The answer is 24.
-
-24
-
-24
-
-The answer is 24.`
- Duration: 5258ms (5258.00s)
- Reason: Expected 24, but got 24
-
-24
-
-the result is 24.
-
-24
-
-here's the answer: 24
-
-the answer will be 24.
-
-24
-
-24
-
-the product of 8 and 3 is 24.
-
-24
-
-the answer is 24.
-
-24
-
-24
-
-24
-
-the result is 24.
-
-24
-
-here's the numerical result: 24
-
-the answer is 24.
-
-24
-
-24
-
-the answer is 24.
- Timestamp: 4/2/2025, 3:44:53 PM
+- Actual: `8 × 3 = 24`
+- Duration: 2782ms (2.78s)
+- Reason: Expected 24, but got 8 × 3 = 24
+- Timestamp: 4/3/2025, 12:06:46 AM

 ## Passed Tests

-### addition - anthropic/claude-3.5-sonnet
-
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1689ms (1689.00s)
- Timestamp: 4/2/2025, 3:44:06 PM
-
-### addition - qwen/qwq-32b
-
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 3807ms (3807.00s)
- Timestamp: 4/2/2025, 3:44:10 PM
-
-### addition - openai/gpt-4o-mini
-
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 885ms (885.00s)
- Timestamp: 4/2/2025, 3:44:11 PM
-
 ### addition - openai/gpt-3.5-turbo

 - Prompt: `add 5 and 3. Return only the number, no explanation.`
 - Expected: `8`
 - Actual: `8`
- Duration: 10455ms (10455.00s)
- Timestamp: 4/2/2025, 3:44:21 PM
+- Duration: 2263ms (2.26s)
+- Timestamp: 4/3/2025, 12:06:38 AM

-### addition - deepseek/deepseek-r1
+### addition - deepseek/deepseek-r1-distill-qwen-14b:free

 - Prompt: `add 5 and 3. Return only the number, no explanation.`
 - Expected: `8`
 - Actual: `8`
- Duration: 12064ms (12064.00s)
- Timestamp: 4/2/2025, 3:44:33 PM
+- Duration: 2228ms (2.23s)
+- Timestamp: 4/3/2025, 12:06:41 AM

-### multiplication - anthropic/claude-3.5-sonnet
+### addition - openai/gpt-4o-mini

- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 1190ms (1190.00s)
- Timestamp: 4/2/2025, 3:44:41 PM
-
-### multiplication - qwen/qwq-32b
-
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 5008ms (5008.00s)
- Timestamp: 4/2/2025, 3:44:46 PM
-
-### multiplication - openai/gpt-4o-mini
-
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 1111ms (1111.00s)
- Timestamp: 4/2/2025, 3:44:47 PM
+- Prompt: `add 5 and 3. Return only the number, no explanation.`
+- Expected: `8`
+- Actual: `8`
+- Duration: 791ms (0.79s)
+- Timestamp: 4/3/2025, 12:06:41 AM

 ### multiplication - openai/gpt-3.5-turbo

 - Prompt: `multiply 8 and 3. Return only the number, no explanation.`
 - Expected: `24`
 - Actual: `24`
- Duration: 984ms (984.00s)
- Timestamp: 4/2/2025, 3:44:48 PM
+- Duration: 1300ms (1.30s)
+- Timestamp: 4/3/2025, 12:06:43 AM

-### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
+### multiplication - openai/gpt-4o-mini

 - Prompt: `multiply 8 and 3. Return only the number, no explanation.`
 - Expected: `24`
 - Actual: `24`
- Duration: 1558ms (1558.00s)
- Timestamp: 4/2/2025, 3:44:55 PM
-
-### division - anthropic/claude-3.5-sonnet
-
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 1405ms (1405.00s)
- Timestamp: 4/2/2025, 3:44:56 PM
-
-### division - qwen/qwq-32b
-
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 917ms (917.00s)
- Timestamp: 4/2/2025, 3:44:57 PM
-
-### division - openai/gpt-4o-mini
-
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 1104ms (1104.00s)
- Timestamp: 4/2/2025, 3:44:58 PM
+- Duration: 1206ms (1.21s)
+- Timestamp: 4/3/2025, 12:06:47 AM

 ### division - openai/gpt-3.5-turbo

 - Prompt: `divide 15 by 3. Return only the number, no explanation.`
 - Expected: `5`
 - Actual: `5`
- Duration: 889ms (889.00s)
- Timestamp: 4/2/2025, 3:44:59 PM
-
-### division - deepseek/deepseek-r1
-
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 7130ms (7130.00s)
- Timestamp: 4/2/2025, 3:45:06 PM
+- Duration: 783ms (0.78s)
+- Timestamp: 4/3/2025, 12:06:48 AM

 ### division - deepseek/deepseek-r1-distill-qwen-14b:free

 - Prompt: `divide 15 by 3. Return only the number, no explanation.`
 - Expected: `5`
 - Actual: `5`
- Duration: 3646ms (3646.00s)
- Timestamp: 4/2/2025, 3:45:10 PM
+- Duration: 2982ms (2.98s)
+- Timestamp: 4/3/2025, 12:06:51 AM
+
+### division - openai/gpt-4o-mini
+
+- Prompt: `divide 15 by 3. Return only the number, no explanation.`
+- Expected: `5`
+- Actual: `5`
+- Duration: 760ms (0.76s)
+- Timestamp: 4/3/2025, 12:06:51 AM