kbot tests: seo

This commit is contained in:
lovebird 2025-04-02 16:26:00 +02:00
parent bc644f8635
commit e51e8a5b7b
18 changed files with 10595 additions and 254 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,5 @@
export declare const logger: any;
import { Logger, ILogObj } from 'tslog';
export declare const logger: Logger<ILogObj>;
export { run } from './commands/run.js';
export declare const module_root: () => string;
export declare const assistant_supported: Record<string, string>;

View File

@ -1,9 +1,9 @@
import { platform } from 'node:process';
import path from 'node:path';
const isWindows = platform === 'win32';
import { createLogger } from '@polymech/log';
import { get_var } from '@polymech/commons';
import { MODULE_NAME } from './constants.js';
import { createLogger } from '@polymech/log';
export const logger = createLogger('llm-tools');
export { run } from './commands/run.js';
export const module_root = () => path.resolve(path.join(get_var(isWindows ? 'HOMEPATH' : 'HOME'), `.${MODULE_NAME}`));
@ -36,4 +36,4 @@ export * from './zod_schema.js';
export { E_OPENAI_MODEL } from './models/cache/openai-models.js';
export { E_OPENROUTER_MODEL } from './models/cache/openrouter-models.js';
export { E_OPENROUTER_MODEL_FREE } from './models/cache/openrouter-models-free.js';
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvaW5kZXgudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQUEsT0FBTyxFQUFFLFFBQVEsRUFBRSxNQUFNLGNBQWMsQ0FBQTtBQUN2QyxPQUFPLElBQUksTUFBTSxXQUFXLENBQUE7QUFFNUIsTUFBTSxTQUFTLEdBQUcsUUFBUSxLQUFLLE9BQU8sQ0FBQTtBQUV0QyxPQUFPLEVBQUUsWUFBWSxFQUFFLE1BQU0sZUFBZSxDQUFBO0FBQzVDLE9BQU8sRUFBRSxPQUFPLEVBQUUsTUFBTSxtQkFBbUIsQ0FBQTtBQUMzQyxPQUFPLEVBQUUsV0FBVyxFQUFFLE1BQU0sZ0JBQWdCLENBQUE7QUFFNUMsTUFBTSxDQUFDLE1BQU0sTUFBTSxHQUFRLFlBQVksQ0FBQyxXQUFXLENBQUMsQ0FBQTtBQUNwRCxPQUFPLEVBQUUsR0FBRyxFQUFFLE1BQU0sbUJBQW1CLENBQUE7QUFDdkMsTUFBTSxDQUFDLE1BQU0sV0FBVyxHQUFHLEdBQUcsRUFBRSxDQUFDLElBQUksQ0FBQyxPQUFPLENBQUMsSUFBSSxDQUFDLElBQUksQ0FBQyxPQUFPLENBQUMsU0FBUyxDQUFDLENBQUMsQ0FBQyxVQUFVLENBQUMsQ0FBQyxDQUFDLE1BQU0sQ0FBQyxFQUFFLElBQUksV0FBVyxFQUFFLENBQUMsQ0FBQyxDQUFBO0FBRXJILE1BQU0sQ0FBQyxNQUFNLG1CQUFtQixHQUEyQjtJQUN6RCxJQUFJLEVBQUUsVUFBVTtJQUNoQixNQUFNLEVBQUUsWUFBWTtJQUNwQixLQUFLLEVBQUUsZUFBZTtJQUN0QixNQUFNLEVBQUUsVUFBVTtJQUNsQixNQUFNLEVBQUUsb0JBQW9CO0lBQzVCLE9BQU8sRUFBRSx5RUFBeUU7SUFDbEYsS0FBSyxFQUFFLGVBQWU7SUFDdEIsT0FBTyxFQUFFLFdBQVc7SUFDcEIsT0FBTyxFQUFFLGFBQWE7SUFDdEIsS0FBSyxFQUFFLGlCQUFpQjtJQUN4QixPQUFPLEVBQUUsa0JBQWtCO0lBQzNCLEtBQUssRUFBRSxlQUFlO0lBQ3RCLE1BQU0sRUFBRSxpQkFBaUI7SUFDekIsTUFBTSxFQUFFLFlBQVk7SUFDcEIsT0FBTyxFQUFFLDJFQUEyRTtJQUNwRixLQUFLLEVBQUUsZUFBZTtJQUN0QixLQUFLLEVBQUUsYUFBYTtJQUNwQixLQUFLLEVBQUUsa0JBQWtCO0lBQ3pCLE1BQU0sRUFBRSxZQUFZO0lBQ3BCLEtBQUssRUFBRSx3QkFBd0I7SUFDL0IsTUFBTSxFQUFFLFlBQVk7Q0FDckIsQ0FBQTtBQUNELGNBQWMsWUFBWSxDQUFBO0FBQzFCLGNBQWMsZ0JBQWdCLENBQUE7QUFDOUIsY0FBYyxpQkFBaUIsQ0FBQTtBQUUvQixPQUFPLEVBQUUsY0FBYyxFQUFFLE1BQU0saUNBQWlDLENBQUE7QUFDaEUsT0FBTyxFQUFFLGtCQUFrQixFQUFFLE1BQU0scUNBQXFDLENBQUE7QUFDeEUsT0FBTyxFQUFFLHVCQUF1QixFQUFFLE1BQU0sMENBQTBDLENBQUEifQ==
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvaW5kZXgudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQUEsT0FBTyxFQUFFLFFBQVEsRUFBRSxNQUFNLGNBQWMsQ0FBQTtBQUN2QyxPQUFPLElBQUksTUFBTSxXQUFXLENBQUE7QUFFNUIsTUFBTSxTQUFTLEdBQUcsUUFBUSxLQUFLLE9BQU8sQ0FBQTtBQUV0QyxPQUFPLEVBQUUsT0FBTyxFQUFFLE1BQU0sbUJBQW1CLENBQUE7QUFDM0MsT0FBTyxFQUFFLFdBQVcsRUFBRSxNQUFNLGdCQUFnQixDQUFBO0FBRTVDLE9BQU8sRUFBRSxZQUFZLEVBQUUsTUFBTSxlQUFlLENBQUE7QUFDNUMsTUFBTSxDQUFDLE1BQU0sTUFBTSxHQUFHLFlBQVksQ0FBQyxXQUFXLENBQStCLENBQUE7QUFDN0UsT0FBTyxFQUFFLEdBQUcsRUFBRSxNQUFNLG1CQUFtQixDQUFBO0FBQ3ZDLE1BQU0sQ0FBQyxNQUFNLFdBQVcsR0FBRyxHQUFHLEVBQUUsQ0FBQyxJQUFJLENBQUMsT0FBTyxDQUFDLElBQUksQ0FBQyxJQUFJLENBQUMsT0FBTyxDQUFDLFNBQVMsQ0FBQyxDQUFDLENBQUMsVUFBVSxDQUFDLENBQUMsQ0FBQyxNQUFNLENBQUMsRUFBRSxJQUFJLFdBQVcsRUFBRSxDQUFDLENBQUMsQ0FBQTtBQUVySCxNQUFNLENBQUMsTUFBTSxtQkFBbUIsR0FBMkI7SUFDekQsSUFBSSxFQUFFLFVBQVU7SUFDaEIsTUFBTSxFQUFFLFlBQVk7SUFDcEIsS0FBSyxFQUFFLGVBQWU7SUFDdEIsTUFBTSxFQUFFLFVBQVU7SUFDbEIsTUFBTSxFQUFFLG9CQUFvQjtJQUM1QixPQUFPLEVBQUUseUVBQXlFO0lBQ2xGLEtBQUssRUFBRSxlQUFlO0lBQ3RCLE9BQU8sRUFBRSxXQUFXO0lBQ3BCLE9BQU8sRUFBRSxhQUFhO0lBQ3RCLEtBQUssRUFBRSxpQkFBaUI7SUFDeEIsT0FBTyxFQUFFLGtCQUFrQjtJQUMzQixLQUFLLEVBQUUsZUFBZTtJQUN0QixNQUFNLEVBQUUsaUJBQWlCO0lBQ3pCLE1BQU0sRUFBRSxZQUFZO0lBQ3BCLE9BQU8sRUFBRSwyRUFBMkU7SUFDcEYsS0FBSyxFQUFFLGVBQWU7SUFDdEIsS0FBSyxFQUFFLGFBQWE7SUFDcEIsS0FBSyxFQUFFLGtCQUFrQjtJQUN6QixNQUFNLEVBQUUsWUFBWTtJQUNwQixLQUFLLEVBQUUsd0JBQXdCO0lBQy9CLE1BQU0sRUFBRSxZQUFZO0NBQ3JCLENBQUE7QUFDRCxjQUFjLFlBQVksQ0FBQTtBQUMxQixjQUFjLGdCQUFnQixDQUFBO0FBQzlCLGNBQWMsaUJBQWlCLENBQUE7QUFFL0IsT0FBTyxFQUFFLGNBQWMsRUFBRSxNQUFNLGlDQUFpQyxDQUFBO0FBQ2hFLE9BQU8sRUFBRSxrQkFBa0IsRUFBRSxNQUFNLHFDQUFxQyxDQUFBO0FBQ3hFLE9BQU8sRUFBRSx1QkFBdUIsRUFBRSxNQUFNLDBDQUEwQyxDQUFBIn0=

View File

@ -102,7 +102,7 @@ export const OptionsSchema = (opts) => {
${chalk.green.bold('custom')}: custom mode
`))
.add('logLevel', z.number()
.default(2)
.default(4)
.describe('Logging level for the application'))
.add('profile', z.string()
.optional()

View File

@ -3,7 +3,7 @@
"messages": [
{
"role": "user",
"content": "divide 15 by 3. Return only the number, no explanation."
"content": "Generate 5 SEO keywords for this text: \"Machine learning algorithms use statistical methods to enable computers to learn from data and improve their performance over time without being explicitly programmed.\" Return only the keywords separated by commas, no explanation."
},
{
"role": "user",

View File

@ -20,6 +20,7 @@
"test:basic": "vitest run tests/unit/basic.test.ts",
"test:math": "vitest run tests/unit/math.test.ts",
"test:format": "vitest run tests/unit/format.test.ts",
"test:seo": "vitest run tests/unit/seo.test.ts",
"test:language": "vitest run tests/unit/language.test.ts",
"test2:watch": "vitest",
"test2:coverage": "vitest run --coverage",

View File

@ -11,7 +11,9 @@ import { ChatCompletionMessageParam } from 'openai/resources/index.mjs'
import { IKBotTask } from '@polymech/ai-tools'
import { logger } from '../index.js'
import { createLogger } from '@polymech/log'
import { Logger, ILogObj } from 'tslog'
import { createClient } from '../client.js'
import { OptionsSchema } from '../zod_schema.js'
import { get } from '../source.js'
@ -30,6 +32,9 @@ import { all } from '../models/index.js'
export const processRun = async (opts: IKBotTask) => {
let options: IKBotTask = null
const logger = new Logger<ILogObj>({
minLevel: opts.logLevel
})
const target = path.resolve(opts.output || opts.path)
if (!exists(target)) {
dir(target)
@ -84,7 +89,7 @@ export const processRun = async (opts: IKBotTask) => {
const logDir = path.resolve(resolve(opts.logs))
const paramsPath = path.join(logDir, 'params.json')
write(paramsPath, JSON.stringify({ ...params }, null, 2))
logger.debug(`Read ${files.length} files from project ${path.resolve(options.path)} with ${options.include}`, files.map(f => f.path), params.tools.map(t => `${t.function.name} : ${t.function.description}`))
logger.debug(`kbot run ${options.mode} : ${options.model} @ ${options.router} : ${files.length} files from project ${path.resolve(options.path)} with ${options.include}`, files.map(f => f.path), params.tools.map(t => `${t.function.name} : ${t.function.description}`))
let ret = null
options = await options.onRun(options) || options
try {
@ -164,6 +169,9 @@ function flattenArrays<T>(arrays: T[][]): T[] {
export const run = async (opts: IKBotTask) => {
const ret = []
const logger = new Logger<ILogObj>({
minLevel: opts.logLevel
})
if (opts.include) {
if (isString(opts.include)) {
opts.include = [opts.include]

View File

@ -1,13 +1,13 @@
import { platform } from 'node:process'
import path from 'node:path'
import { Logger, ILogObj } from 'tslog'
const isWindows = platform === 'win32'
import { createLogger } from '@polymech/log'
import { get_var } from '@polymech/commons'
import { MODULE_NAME } from './constants.js'
export const logger: any = createLogger('llm-tools')
import { createLogger } from '@polymech/log'
export const logger = createLogger('llm-tools') as unknown as Logger<ILogObj>
export { run } from './commands/run.js'
export const module_root = () => path.resolve(path.join(get_var(isWindows ? 'HOMEPATH' : 'HOME'), `.${MODULE_NAME}`))

View File

@ -158,7 +158,7 @@ export const OptionsSchema = (opts?: any): any => {
.add(
'logLevel',
z.number()
.default(2)
.default(4)
.describe('Logging level for the application')
)
.add(

View File

@ -23,7 +23,7 @@ export const isOpenRouterModel = (model: string): boolean => {
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
export const TEST_TIMEOUT = 30000 // 30 seconds timeout for API calls
export const TEST_TIMEOUT = 5000 // 5 seconds timeout for API calls
// Report paths configuration
export const REPORTS_DIR = path.resolve(__dirname, './reports')
@ -127,6 +127,7 @@ export const runTest = async (
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
logLevel:4,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.model as string
@ -277,27 +278,23 @@ export const generateTestReport = (
// Add highscore section
report += '## Highscores\n\n'
// Create a table header
// Add regular test rankings
report += '### Performance Rankings (Duration)\n\n'
report += '| Test | Model | Duration (ms) | Duration (s) |\n'
report += '|------|-------|--------------|--------------|\n'
// Sort all results by duration
const allResults = Array.from(latestResults.entries())
.flatMap(([testName, modelResults]) =>
Array.from(modelResults.entries())
.map(([model, result]) => ({
test: testName,
model,
duration: result.duration || 0
}))
)
.sort((a, b) => a.duration - b.duration)
Array.from(latestResults.entries()).forEach(([testName, modelResults]) => {
const sortedResults = Array.from(modelResults.entries())
.map(([model, result]) => ({
model,
duration: result.duration || 0
}))
.sort((a, b) => a.duration - b.duration)
// Add all results to the table
allResults.forEach(({ test, model, duration }) => {
report += `| ${test} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
sortedResults.forEach(({ model, duration }) => {
report += `| ${testName} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
})
})
report += '\n'
// Add summary section
@ -323,7 +320,7 @@ export const generateTestReport = (
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms (${(result.duration || 0 / 1000).toFixed(2)}s)\n`
report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
if (result.error) {
report += `- Error Type: ${result.error.type}\n`
report += `- Error Code: ${result.error.code}\n`
@ -353,7 +350,7 @@ export const generateTestReport = (
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms (${(result.duration || 0 / 1000).toFixed(2)}s)\n`
report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}

File diff suppressed because it is too large Load Diff

View File

@ -743,6 +743,260 @@
"passed": true,
"duration": 1790,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "anthropic/claude-3.5-sonnet",
"router": "anthropic/claude-3.5-sonnet",
"timestamp": "2025-04-02T13:44:06.429Z",
"passed": true,
"duration": 1689,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "qwen/qwq-32b",
"router": "qwen/qwq-32b",
"timestamp": "2025-04-02T13:44:10.240Z",
"passed": true,
"duration": 3807,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-02T13:44:11.128Z",
"passed": true,
"duration": 885,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-02T13:44:21.587Z",
"passed": true,
"duration": 10455,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "deepseek/deepseek-r1",
"router": "deepseek/deepseek-r1",
"timestamp": "2025-04-02T13:44:33.654Z",
"passed": true,
"duration": 12064,
"category": "basic"
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"The sum of 5 and 3 is 8. Therefore, the result is \\boxed{8}."
],
"expected": "8",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-02T13:44:40.062Z",
"passed": false,
"duration": 6405,
"reason": "Expected 8, but got the sum of 5 and 3 is 8. therefore, the result is \\boxed{8}.",
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "anthropic/claude-3.5-sonnet",
"router": "anthropic/claude-3.5-sonnet",
"timestamp": "2025-04-02T13:44:41.261Z",
"passed": true,
"duration": 1190,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "qwen/qwq-32b",
"router": "qwen/qwq-32b",
"timestamp": "2025-04-02T13:44:46.272Z",
"passed": true,
"duration": 5008,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-02T13:44:47.386Z",
"passed": true,
"duration": 1111,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-02T13:44:48.372Z",
"passed": true,
"duration": 984,
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24\n\n24\n\nThe result is 24.\n\n24\n\nHere's the answer: 24\n\nThe answer will be 24.\n\n24\n\n24\n\nThe product of 8 and 3 is 24.\n\n24\n\nThe answer is 24.\n\n24\n\n24\n\n24\n\nThe result is 24.\n\n24\n\nHere's the numerical result: 24\n\nThe answer is 24.\n\n24\n\n24\n\nThe answer is 24."
],
"expected": "24",
"model": "deepseek/deepseek-r1",
"router": "deepseek/deepseek-r1",
"timestamp": "2025-04-02T13:44:53.633Z",
"passed": false,
"duration": 5258,
"reason": "Expected 24, but got 24\n\n24\n\nthe result is 24.\n\n24\n\nhere's the answer: 24\n\nthe answer will be 24.\n\n24\n\n24\n\nthe product of 8 and 3 is 24.\n\n24\n\nthe answer is 24.\n\n24\n\n24\n\n24\n\nthe result is 24.\n\n24\n\nhere's the numerical result: 24\n\nthe answer is 24.\n\n24\n\n24\n\nthe answer is 24.",
"category": "basic"
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-02T13:44:55.196Z",
"passed": true,
"duration": 1558,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "anthropic/claude-3.5-sonnet",
"router": "anthropic/claude-3.5-sonnet",
"timestamp": "2025-04-02T13:44:56.604Z",
"passed": true,
"duration": 1405,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "qwen/qwq-32b",
"router": "qwen/qwq-32b",
"timestamp": "2025-04-02T13:44:57.523Z",
"passed": true,
"duration": 917,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-04-02T13:44:58.630Z",
"passed": true,
"duration": 1104,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-04-02T13:44:59.523Z",
"passed": true,
"duration": 889,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "deepseek/deepseek-r1",
"router": "deepseek/deepseek-r1",
"timestamp": "2025-04-02T13:45:06.658Z",
"passed": true,
"duration": 7130,
"category": "basic"
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
"timestamp": "2025-04-02T13:45:10.307Z",
"passed": true,
"duration": 3646,
"category": "basic"
}
],
"highscores": [
@ -751,13 +1005,13 @@
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 1376,
"duration_secs": 1.376
"duration": 885,
"duration_secs": 0.885
},
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1951,
"duration_secs": 1.951
"duration": 1689,
"duration_secs": 1.689
}
]
},
@ -766,13 +1020,13 @@
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 870,
"duration_secs": 0.87
"duration": 984,
"duration_secs": 0.984
},
{
"model": "anthropic/claude-3.5-sonnet",
"duration": 1096,
"duration_secs": 1.096
"model": "openai/gpt-4o-mini",
"duration": 1111,
"duration_secs": 1.111
}
]
},
@ -781,16 +1035,16 @@
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 794,
"duration_secs": 0.794
"duration": 889,
"duration_secs": 0.889
},
{
"model": "openai/gpt-4o-mini",
"duration": 929,
"duration_secs": 0.929
"model": "qwen/qwq-32b",
"duration": 917,
"duration_secs": 0.917
}
]
}
],
"lastUpdated": "2025-04-02T11:26:40.358Z"
"lastUpdated": "2025-04-02T13:45:10.308Z"
}

View File

@ -4,36 +4,130 @@
| Test | Model | Duration (ms) | Duration (s) |
|------|-------|--------------|--------------|
| division | openai/gpt-3.5-turbo | 794 | 0.79 |
| multiplication | openai/gpt-3.5-turbo | 870 | 0.87 |
| division | openai/gpt-4o-mini | 929 | 0.93 |
| multiplication | anthropic/claude-3.5-sonnet | 1096 | 1.10 |
| division | anthropic/claude-3.5-sonnet | 1276 | 1.28 |
| addition | openai/gpt-4o-mini | 1376 | 1.38 |
| multiplication | openai/gpt-4o-mini | 1444 | 1.44 |
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 1790 | 1.79 |
| addition | anthropic/claude-3.5-sonnet | 1951 | 1.95 |
| multiplication | qwen/qwq-32b | 3592 | 3.59 |
| addition | qwen/qwq-32b | 3726 | 3.73 |
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 5646 | 5.65 |
| division | qwen/qwq-32b | 5768 | 5.77 |
| addition | openai/gpt-3.5-turbo | 7188 | 7.19 |
| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 7531 | 7.53 |
| multiplication | deepseek/deepseek-r1 | 10983 | 10.98 |
| addition | deepseek/deepseek-r1 | 15157 | 15.16 |
| division | deepseek/deepseek-r1 | 15409 | 15.41 |
| addition | openai/gpt-4o-mini | 885 | 0.89 |
| division | openai/gpt-3.5-turbo | 889 | 0.89 |
| division | qwen/qwq-32b | 917 | 0.92 |
| multiplication | openai/gpt-3.5-turbo | 984 | 0.98 |
| division | openai/gpt-4o-mini | 1104 | 1.10 |
| multiplication | openai/gpt-4o-mini | 1111 | 1.11 |
| multiplication | anthropic/claude-3.5-sonnet | 1190 | 1.19 |
| division | anthropic/claude-3.5-sonnet | 1405 | 1.41 |
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 1558 | 1.56 |
| addition | anthropic/claude-3.5-sonnet | 1689 | 1.69 |
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 3646 | 3.65 |
| addition | qwen/qwq-32b | 3807 | 3.81 |
| multiplication | qwen/qwq-32b | 5008 | 5.01 |
| division | deepseek/deepseek-r1 | 7130 | 7.13 |
| addition | openai/gpt-3.5-turbo | 10455 | 10.46 |
| addition | deepseek/deepseek-r1 | 12064 | 12.06 |
## Summary
- Total Tests: 18
- Passed: 18
- Failed: 0
- Success Rate: 100.00%
- Average Duration: 4807ms (4.81s)
- Passed: 16
- Failed: 2
- Success Rate: 88.89%
- Average Duration: 3639ms (3.64s)
## Failed Tests
*No failed tests*
### addition - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `The sum of 5 and 3 is 8. Therefore, the result is \boxed{8}.`
- Duration: 6405ms (6405.00s)
- Reason: Expected 8, but got the sum of 5 and 3 is 8. therefore, the result is \boxed{8}.
- Timestamp: 4/2/2025, 3:44:40 PM
### multiplication - deepseek/deepseek-r1
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24
24
The result is 24.
24
Here's the answer: 24
The answer will be 24.
24
24
The product of 8 and 3 is 24.
24
The answer is 24.
24
24
24
The result is 24.
24
Here's the numerical result: 24
The answer is 24.
24
24
The answer is 24.`
- Duration: 5258ms (5258.00s)
- Reason: Expected 24, but got 24
24
the result is 24.
24
here's the answer: 24
the answer will be 24.
24
24
the product of 8 and 3 is 24.
24
the answer is 24.
24
24
24
the result is 24.
24
here's the numerical result: 24
the answer is 24.
24
24
the answer is 24.
- Timestamp: 4/2/2025, 3:44:53 PM
## Passed Tests
@ -42,142 +136,126 @@
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1951ms (1951.00s)
- Timestamp: 4/2/2025, 1:25:15 PM
- Duration: 1689ms (1689.00s)
- Timestamp: 4/2/2025, 3:44:06 PM
### addition - qwen/qwq-32b
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 3726ms (3726.00s)
- Timestamp: 4/2/2025, 1:25:19 PM
- Duration: 3807ms (3807.00s)
- Timestamp: 4/2/2025, 3:44:10 PM
### addition - openai/gpt-4o-mini
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1376ms (1376.00s)
- Timestamp: 4/2/2025, 1:25:20 PM
- Duration: 885ms (885.00s)
- Timestamp: 4/2/2025, 3:44:11 PM
### addition - openai/gpt-3.5-turbo
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 7188ms (7188.00s)
- Timestamp: 4/2/2025, 1:25:28 PM
- Duration: 10455ms (10455.00s)
- Timestamp: 4/2/2025, 3:44:21 PM
### addition - deepseek/deepseek-r1
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 15157ms (15157.00s)
- Timestamp: 4/2/2025, 1:25:43 PM
### addition - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 7531ms (7531.00s)
- Timestamp: 4/2/2025, 1:25:50 PM
- Duration: 12064ms (12064.00s)
- Timestamp: 4/2/2025, 3:44:33 PM
### multiplication - anthropic/claude-3.5-sonnet
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 1096ms (1096.00s)
- Timestamp: 4/2/2025, 1:25:51 PM
- Duration: 1190ms (1190.00s)
- Timestamp: 4/2/2025, 3:44:41 PM
### multiplication - qwen/qwq-32b
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 3592ms (3592.00s)
- Timestamp: 4/2/2025, 1:25:55 PM
- Duration: 5008ms (5008.00s)
- Timestamp: 4/2/2025, 3:44:46 PM
### multiplication - openai/gpt-4o-mini
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 1444ms (1444.00s)
- Timestamp: 4/2/2025, 1:25:56 PM
- Duration: 1111ms (1111.00s)
- Timestamp: 4/2/2025, 3:44:47 PM
### multiplication - openai/gpt-3.5-turbo
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 870ms (870.00s)
- Timestamp: 4/2/2025, 1:25:57 PM
### multiplication - deepseek/deepseek-r1
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 10983ms (10983.00s)
- Timestamp: 4/2/2025, 1:26:08 PM
- Duration: 984ms (984.00s)
- Timestamp: 4/2/2025, 3:44:48 PM
### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Duration: 5646ms (5646.00s)
- Timestamp: 4/2/2025, 1:26:14 PM
- Duration: 1558ms (1558.00s)
- Timestamp: 4/2/2025, 3:44:55 PM
### division - anthropic/claude-3.5-sonnet
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 1276ms (1276.00s)
- Timestamp: 4/2/2025, 1:26:15 PM
- Duration: 1405ms (1405.00s)
- Timestamp: 4/2/2025, 3:44:56 PM
### division - qwen/qwq-32b
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 5768ms (5768.00s)
- Timestamp: 4/2/2025, 1:26:21 PM
- Duration: 917ms (917.00s)
- Timestamp: 4/2/2025, 3:44:57 PM
### division - openai/gpt-4o-mini
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 929ms (929.00s)
- Timestamp: 4/2/2025, 1:26:22 PM
- Duration: 1104ms (1104.00s)
- Timestamp: 4/2/2025, 3:44:58 PM
### division - openai/gpt-3.5-turbo
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 794ms (794.00s)
- Timestamp: 4/2/2025, 1:26:23 PM
- Duration: 889ms (889.00s)
- Timestamp: 4/2/2025, 3:44:59 PM
### division - deepseek/deepseek-r1
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 15409ms (15409.00s)
- Timestamp: 4/2/2025, 1:26:38 PM
- Duration: 7130ms (7130.00s)
- Timestamp: 4/2/2025, 3:45:06 PM
### division - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Duration: 1790ms (1790.00s)
- Timestamp: 4/2/2025, 1:26:40 PM
- Duration: 3646ms (3646.00s)
- Timestamp: 4/2/2025, 3:45:10 PM

File diff suppressed because it is too large Load Diff

View File

@ -1,38 +1,39 @@
# Math Operations Test Results
## Highscores
| Test | Model | Duration (ms) | Duration (s) |
|------|-------|--------------|--------------|
| power | openai/gpt-3.5-turbo | 719 | 0.72 |
| power | openai/gpt-4o-mini | 743 | 0.74 |
| square_root | openai/gpt-3.5-turbo | 751 | 0.75 |
| square_root | anthropic/claude-3.5-sonnet | 756 | 0.76 |
| factorial | openai/gpt-3.5-turbo | 782 | 0.78 |
| square_root | openai/gpt-4o-mini | 785 | 0.79 |
| fibonacci | openai/gpt-3.5-turbo | 800 | 0.80 |
| factorial | openai/gpt-4o-mini | 872 | 0.87 |
| quadratic | openai/gpt-3.5-turbo | 878 | 0.88 |
| factorial | anthropic/claude-3.5-sonnet | 970 | 0.97 |
| fibonacci | openai/gpt-4o-mini | 974 | 0.97 |
| quadratic | openai/gpt-4o-mini | 994 | 0.99 |
| power | anthropic/claude-3.5-sonnet | 1241 | 1.24 |
| quadratic | anthropic/claude-3.5-sonnet | 1650 | 1.65 |
| fibonacci | anthropic/claude-3.5-sonnet | 3791 | 3.79 |
| factorial | qwen/qwq-32b | 4954 | 4.95 |
| power | qwen/qwq-32b | 6255 | 6.25 |
| square_root | qwen/qwq-32b | 6745 | 6.75 |
| quadratic | qwen/qwq-32b | 10222 | 10.22 |
| fibonacci | qwen/qwq-32b | 12322 | 12.32 |
| factorial | openai/gpt-3.5-turbo | 827 | 0.83 |
| factorial | openai/gpt-4o-mini | 956 | 0.96 |
| square_root | openai/gpt-4o-mini | 964 | 0.96 |
| square_root | openai/gpt-3.5-turbo | 1080 | 1.08 |
| power | anthropic/claude-3.5-sonnet | 1136 | 1.14 |
| power | openai/gpt-4o-mini | 1259 | 1.26 |
| power | openai/gpt-3.5-turbo | 1498 | 1.50 |
| fibonacci | openai/gpt-3.5-turbo | 1543 | 1.54 |
| fibonacci | openai/gpt-4o-mini | 1673 | 1.67 |
| factorial | anthropic/claude-3.5-sonnet | 1853 | 1.85 |
| fibonacci | anthropic/claude-3.5-sonnet | 2004 | 2.00 |
| square_root | anthropic/claude-3.5-sonnet | 2012 | 2.01 |
| factorial | deepseek/deepseek-r1-distill-qwen-14b:free | 4814 | 4.81 |
| power | deepseek/deepseek-r1 | 5414 | 5.41 |
| square_root | qwen/qwq-32b | 5888 | 5.89 |
| square_root | deepseek/deepseek-r1-distill-qwen-14b:free | 6114 | 6.11 |
| quadratic | qwen/qwq-32b | 6795 | 6.79 |
| factorial | qwen/qwq-32b | 6892 | 6.89 |
| power | qwen/qwq-32b | 7572 | 7.57 |
| power | deepseek/deepseek-r1-distill-qwen-14b:free | 9891 | 9.89 |
| square_root | deepseek/deepseek-r1 | 10309 | 10.31 |
| factorial | deepseek/deepseek-r1 | 11193 | 11.19 |
## Summary
- Total Tests: 20
- Passed: 14
- Failed: 6
- Success Rate: 70.00%
- Average Duration: 2860ms (2.86s)
- Total Tests: 29
- Passed: 22
- Failed: 7
- Success Rate: 75.86%
- Average Duration: 4745ms (4.75s)
## Failed Tests
@ -41,54 +42,67 @@
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-3,-2`
- Actual: `-2,-3`
- Duration: 1650ms (1650.00s)
- Duration: 1892ms (1892.00s)
- Reason: Expected -3,-2, but got -2,-3
- Timestamp: 4/2/2025, 1:22:10 PM
- Timestamp: 4/2/2025, 3:32:51 PM
### quadratic - openai/gpt-4o-mini
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-3,-2`
- Actual: `-2, -3`
- Duration: 994ms (994.00s)
- Duration: 853ms (853.00s)
- Reason: Expected -3,-2, but got -2, -3
- Timestamp: 4/2/2025, 1:22:21 PM
- Timestamp: 4/2/2025, 3:32:59 PM
### quadratic - openai/gpt-3.5-turbo
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-3,-2`
- Actual: `-2, -3`
- Duration: 878ms (878.00s)
- Duration: 832ms (832.00s)
- Reason: Expected -3,-2, but got -2, -3
- Timestamp: 4/2/2025, 1:22:22 PM
- Timestamp: 4/2/2025, 3:32:59 PM
### quadratic - deepseek/deepseek-r1
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-3,-2`
- Actual: `-2, -3`
- Duration: 19850ms (19850.00s)
- Reason: Expected -3,-2, but got -2, -3
- Timestamp: 4/2/2025, 3:33:19 PM
### quadratic - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-3,-2`
- Actual: `The solutions to the quadratic equation x² + 5x + 6 = 0 are x = -2 and x = -3.
-2,-3`
- Duration: 15811ms (15811.00s)
- Reason: Expected -3,-2, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are x = -2 and x = -3.
-2,-3
- Timestamp: 4/2/2025, 3:33:35 PM
### fibonacci - qwen/qwq-32b
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `5`
- Duration: 12322ms (12322.00s)
- Duration: 1509ms (1509.00s)
- Reason: Expected 8, but got 5
- Timestamp: 4/2/2025, 1:22:46 PM
- Timestamp: 4/2/2025, 3:34:05 PM
### fibonacci - openai/gpt-4o-mini
### fibonacci - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `5`
- Duration: 974ms (974.00s)
- Duration: 5171ms (5171.00s)
- Reason: Expected 8, but got 5
- Timestamp: 4/2/2025, 1:22:47 PM
### power - qwen/qwq-32b
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: ``
- Duration: 6255ms (6255.00s)
- Reason: Model returned empty response
- Timestamp: 4/2/2025, 1:23:04 PM
- Timestamp: 4/2/2025, 3:34:44 PM
## Passed Tests
@ -97,110 +111,174 @@
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
- Expected: `-3,-2`
- Actual: `-3,-2`
- Duration: 10222ms (10222.00s)
- Timestamp: 4/2/2025, 1:22:20 PM
- Duration: 6795ms (6795.00s)
- Timestamp: 4/2/2025, 3:32:58 PM
### factorial - anthropic/claude-3.5-sonnet
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 970ms (970.00s)
- Timestamp: 4/2/2025, 1:22:23 PM
- Duration: 1853ms (1853.00s)
- Timestamp: 4/2/2025, 3:33:37 PM
### factorial - qwen/qwq-32b
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 4954ms (4954.00s)
- Timestamp: 4/2/2025, 1:22:28 PM
- Duration: 6892ms (6892.00s)
- Timestamp: 4/2/2025, 3:33:44 PM
### factorial - openai/gpt-4o-mini
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 872ms (872.00s)
- Timestamp: 4/2/2025, 1:22:29 PM
- Duration: 956ms (956.00s)
- Timestamp: 4/2/2025, 3:33:45 PM
### factorial - openai/gpt-3.5-turbo
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 782ms (782.00s)
- Timestamp: 4/2/2025, 1:22:30 PM
- Duration: 827ms (827.00s)
- Timestamp: 4/2/2025, 3:33:46 PM
### factorial - deepseek/deepseek-r1
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 11193ms (11193.00s)
- Timestamp: 4/2/2025, 3:33:57 PM
### factorial - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Duration: 4814ms (4814.00s)
- Timestamp: 4/2/2025, 3:34:02 PM
### fibonacci - anthropic/claude-3.5-sonnet
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 3791ms (3791.00s)
- Timestamp: 4/2/2025, 1:22:33 PM
- Duration: 2004ms (2004.00s)
- Timestamp: 4/2/2025, 3:34:04 PM
### fibonacci - openai/gpt-4o-mini
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1673ms (1673.00s)
- Timestamp: 4/2/2025, 3:34:07 PM
### fibonacci - openai/gpt-3.5-turbo
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 800ms (800.00s)
- Timestamp: 4/2/2025, 1:22:48 PM
- Duration: 1543ms (1543.00s)
- Timestamp: 4/2/2025, 3:34:08 PM
### square_root - anthropic/claude-3.5-sonnet
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 756ms (756.00s)
- Timestamp: 4/2/2025, 1:22:48 PM
- Duration: 2012ms (2012.00s)
- Timestamp: 4/2/2025, 3:34:46 PM
### square_root - qwen/qwq-32b
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 6745ms (6745.00s)
- Timestamp: 4/2/2025, 1:22:55 PM
- Duration: 5888ms (5888.00s)
- Timestamp: 4/2/2025, 3:34:52 PM
### square_root - openai/gpt-4o-mini
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 785ms (785.00s)
- Timestamp: 4/2/2025, 1:22:56 PM
- Duration: 964ms (964.00s)
- Timestamp: 4/2/2025, 3:34:52 PM
### square_root - openai/gpt-3.5-turbo
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 751ms (751.00s)
- Timestamp: 4/2/2025, 1:22:57 PM
- Duration: 1080ms (1080.00s)
- Timestamp: 4/2/2025, 3:34:54 PM
### square_root - deepseek/deepseek-r1
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 10309ms (10309.00s)
- Timestamp: 4/2/2025, 3:35:04 PM
### square_root - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
- Expected: `4`
- Actual: `4`
- Duration: 6114ms (6114.00s)
- Timestamp: 4/2/2025, 3:35:10 PM
### power - anthropic/claude-3.5-sonnet
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 1241ms (1241.00s)
- Timestamp: 4/2/2025, 1:22:58 PM
- Duration: 1136ms (1136.00s)
- Timestamp: 4/2/2025, 3:35:11 PM
### power - qwen/qwq-32b
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 7572ms (7572.00s)
- Timestamp: 4/2/2025, 3:35:19 PM
### power - openai/gpt-4o-mini
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 743ms (743.00s)
- Timestamp: 4/2/2025, 1:23:05 PM
- Duration: 1259ms (1259.00s)
- Timestamp: 4/2/2025, 3:35:20 PM
### power - openai/gpt-3.5-turbo
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 719ms (719.00s)
- Timestamp: 4/2/2025, 1:23:06 PM
- Duration: 1498ms (1498.00s)
- Timestamp: 4/2/2025, 3:35:21 PM
### power - deepseek/deepseek-r1
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 5414ms (5414.00s)
- Timestamp: 4/2/2025, 3:35:27 PM
### power - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Duration: 9891ms (9891.00s)
- Timestamp: 4/2/2025, 3:35:37 PM

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,124 @@
# SEO Keyword Generation Test Results
## Model Rankings
| Model | Avg Matches | Total Matches | Tests Run |
|-------|-------------|---------------|-----------|
| anthropic/claude-3.5-sonnet | 3.50 | 7/10 | 2 |
| openai/gpt-4o-mini | 3.50 | 7/10 | 2 |
| openai/gpt-3.5-turbo | 3.50 | 7/10 | 2 |
## Summary
- Total Tests: 12
- Passed: 6
- Failed: 6
- Success Rate: 50.00%
- Average Duration: 3222ms (3.22s)
## Failed Tests
### seo_keywords_text - qwen/qwq-32b
- Prompt: `Generate SEO keywords for descriptive text`
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
- Actual: ``
- Duration: 5000ms (5.00s)
- Reason: API call timed out
- Timestamp: 4/2/2025, 4:25:03 PM
### seo_keywords_text - deepseek/deepseek-r1
- Prompt: `Generate SEO keywords for descriptive text`
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
- Actual: ``
- Duration: 5000ms (5.00s)
- Reason: API call timed out
- Timestamp: 4/2/2025, 4:25:10 PM
### seo_keywords_text - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Generate SEO keywords for descriptive text`
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
- Actual: ``
- Duration: 5000ms (5.00s)
- Reason: API call timed out
- Timestamp: 4/2/2025, 4:25:15 PM
### seo_keywords_technical - qwen/qwq-32b
- Prompt: `Generate SEO keywords for technical text`
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
- Actual: ``
- Duration: 5000ms (5.00s)
- Reason: API call timed out
- Timestamp: 4/2/2025, 4:25:22 PM
### seo_keywords_technical - deepseek/deepseek-r1
- Prompt: `Generate SEO keywords for technical text`
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
- Actual: ``
- Duration: 5000ms (5.00s)
- Reason: API call timed out
- Timestamp: 4/2/2025, 4:25:30 PM
### seo_keywords_technical - deepseek/deepseek-r1-distill-qwen-14b:free
- Prompt: `Generate SEO keywords for technical text`
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
- Actual: ``
- Duration: 5000ms (5.00s)
- Reason: API call timed out
- Timestamp: 4/2/2025, 4:25:35 PM
## Passed Tests
### seo_keywords_text - anthropic/claude-3.5-sonnet
- Prompt: `Generate 5 SEO keywords for this text: "The ancient city of Rome, with its magnificent Colosseum, historic Roman Forum, and stunning Vatican City, offers visitors a unique blend of history, art, and culture." Return only the keywords separated by commas, no explanation.`
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
- Actual: `ancient Rome, Colosseum tourism, Roman Forum attractions, Vatican City sightseeing, historic Rome landmarks`
- Duration: 1719ms (1.72s)
- Timestamp: 4/2/2025, 4:24:58 PM
### seo_keywords_text - openai/gpt-4o-mini
- Prompt: `Generate 5 SEO keywords for this text: "The ancient city of Rome, with its magnificent Colosseum, historic Roman Forum, and stunning Vatican City, offers visitors a unique blend of history, art, and culture." Return only the keywords separated by commas, no explanation.`
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
- Actual: `ancient Rome, Colosseum, Roman Forum, Vatican City, history art culture`
- Duration: 1539ms (1.54s)
- Timestamp: 4/2/2025, 4:25:04 PM
### seo_keywords_text - openai/gpt-3.5-turbo
- Prompt: `Generate 5 SEO keywords for this text: "The ancient city of Rome, with its magnificent Colosseum, historic Roman Forum, and stunning Vatican City, offers visitors a unique blend of history, art, and culture." Return only the keywords separated by commas, no explanation.`
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
- Actual: `ancient city of Rome, Colosseum, Roman Forum, Vatican City, history, art, culture`
- Duration: 1002ms (1.00s)
- Timestamp: 4/2/2025, 4:25:05 PM
### seo_keywords_technical - anthropic/claude-3.5-sonnet
- Prompt: `Generate 5 SEO keywords for this text: "Machine learning algorithms use statistical methods to enable computers to learn from data and improve their performance over time without being explicitly programmed." Return only the keywords separated by commas, no explanation.`
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
- Actual: `machine learning algorithms, statistical learning, data science, automated learning, predictive modeling`
- Duration: 1830ms (1.83s)
- Timestamp: 4/2/2025, 4:25:17 PM
### seo_keywords_technical - openai/gpt-4o-mini
- Prompt: `Generate 5 SEO keywords for this text: "Machine learning algorithms use statistical methods to enable computers to learn from data and improve their performance over time without being explicitly programmed." Return only the keywords separated by commas, no explanation.`
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
- Actual: `machine learning algorithms, statistical methods, computer learning, data improvement, performance enhancement`
- Duration: 1694ms (1.69s)
- Timestamp: 4/2/2025, 4:25:24 PM
### seo_keywords_technical - openai/gpt-3.5-turbo
- Prompt: `Generate 5 SEO keywords for this text: "Machine learning algorithms use statistical methods to enable computers to learn from data and improve their performance over time without being explicitly programmed." Return only the keywords separated by commas, no explanation.`
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
- Actual: `Machine learning algorithms, statistical methods, computers, data, performance.`
- Duration: 881ms (0.88s)
- Timestamp: 4/2/2025, 4:25:25 PM

View File

@ -0,0 +1,245 @@
import { describe, it, expect } from 'vitest'
import * as path from 'node:path'
import { sync as exists } from "@polymech/fs/exists"
import { sync as write } from "@polymech/fs/write"
import {
getDefaultModels,
TEST_BASE_PATH,
TEST_LOGS_PATH,
TEST_PREFERENCES_PATH,
TEST_TIMEOUT,
TestResult,
runTest,
getReportPaths
} from './commons'
// Optionally override models for this specific test file
const models = getDefaultModels()

/**
 * Benchmark suite: asks every configured model to produce SEO keywords for a
 * fixed text, scores the answer against a reference keyword list, and finally
 * writes a markdown report (rankings, summary, failed and passed runs).
 *
 * The per-model tests never fail the vitest run themselves: errors thrown by
 * `runTest` are recorded as failed `TestResult`s so the report stays complete.
 * Only the final report test carries a hard assertion (report file exists).
 */
describe('SEO Keyword Generation', () => {
  type ModelName = (typeof models)[number]

  /** One keyword-generation scenario that is sent to every model. */
  interface KeywordCase {
    /** Identifier stored in `TestResult.test`. */
    test: string
    /** Full prompt sent to the model. */
    prompt: string
    /** Comma-separated reference keywords the answer is scored against. */
    expected: string
  }

  /** Running totals per model, used to build the ranking table. */
  interface ModelScore {
    /** Sum of matched keywords over all scored tests. */
    total: number
    /** Number of scored tests. */
    tests: number
    /** Sum of expected keywords over all scored tests (upper bound for `total`). */
    possible: number
  }

  /** A run counts as passed when at least this many reference keywords are matched. */
  const MIN_MATCHES = 2

  const DESCRIPTIVE_CASE: KeywordCase = {
    test: 'seo_keywords_text',
    prompt: 'Generate 5 SEO keywords for this text: "The ancient city of Rome, with its magnificent Colosseum, historic Roman Forum, and stunning Vatican City, offers visitors a unique blend of history, art, and culture." Return only the keywords separated by commas, no explanation.',
    expected: 'Rome, Colosseum, Roman Forum, Vatican City, ancient history'
  }

  const TECHNICAL_CASE: KeywordCase = {
    test: 'seo_keywords_technical',
    prompt: 'Generate 5 SEO keywords for this text: "Machine learning algorithms use statistical methods to enable computers to learn from data and improve their performance over time without being explicitly programmed." Return only the keywords separated by commas, no explanation.',
    expected: 'machine learning, algorithms, artificial intelligence, data science, statistical methods'
  }

  const testResults: TestResult[] = []
  const TEST_LOG_PATH = getReportPaths('seo', 'json')
  const TEST_REPORT_PATH = getReportPaths('seo', 'md')

  // Track model performance
  const modelScores: Record<string, ModelScore> = {}

  /** Splits a comma-separated keyword string into trimmed, lower-cased, non-empty entries. */
  const toKeywords = (csv: string): string[] =>
    csv
      .trim()
      .toLowerCase()
      .split(',')
      .map(keyword => keyword.trim())
      .filter(keyword => keyword.length > 0)

  /**
   * Runs one scenario against one model, scores the answer and records the outcome
   * in `testResults` / `modelScores`.
   *
   * Errors thrown by `runTest` are converted into a failed `TestResult` carrying the
   * real prompt and the elapsed wall-clock time, instead of being rethrown.
   * Runs that error or return no answer are not added to `modelScores`, so such
   * models do not appear in the ranking table.
   */
  const runKeywordCase = async (testCase: KeywordCase, modelName: ModelName): Promise<void> => {
    const startedAt = Date.now()
    try {
      const result = await runTest(
        testCase.prompt,
        testCase.expected,
        testCase.test,
        modelName,
        TEST_LOG_PATH
      )
      // Pushed before scoring: the object is mutated below and the report reads it later.
      testResults.push(result)

      // Handle potential empty or invalid results
      const rawAnswer = result.result?.[0]
      if (!rawAnswer) {
        result.passed = false
        result.reason = 'No keywords generated'
        return
      }

      const resultKeywords = toKeywords(rawAnswer)
      const expectedKeywords = toKeywords(testCase.expected)
      // Substring match: "ancient rome" counts as a hit for the reference keyword "rome".
      const matchedKeywords = expectedKeywords.filter(expectedKeyword =>
        resultKeywords.some(generated => generated.includes(expectedKeyword))
      )

      // Update model score
      const score = (modelScores[modelName] ??= { total: 0, tests: 0, possible: 0 })
      score.total += matchedKeywords.length
      score.tests += 1
      score.possible += expectedKeywords.length

      // Log the actual results for debugging
      console.log(`Model ${modelName} generated keywords:`, resultKeywords)
      console.log(`Matched keywords:`, matchedKeywords)
      console.log(`Current score for ${modelName}: ${matchedKeywords.length}/${expectedKeywords.length}`)

      // Override the runner's verdict: pass when enough reference keywords are matched.
      result.passed = matchedKeywords.length >= MIN_MATCHES
      result.reason = result.passed
        ? undefined
        : `Only matched ${matchedKeywords.length} keywords, expected at least ${MIN_MATCHES}`
    } catch (error) {
      // Handle timeout or other errors gracefully: record a failed result instead of failing the test.
      const failure: TestResult = {
        test: testCase.test,
        prompt: testCase.prompt,
        result: [],
        expected: testCase.expected,
        model: modelName,
        router: 'openrouter',
        timestamp: new Date().toISOString(),
        passed: false,
        duration: Date.now() - startedAt,
        reason: error instanceof Error ? error.message : 'Unknown error occurred'
      }
      testResults.push(failure)
    }
  }

  /**
   * Renders one result as a markdown subsection.
   * `withFailureDetails` adds the optional Error / Reason lines used in the failed section.
   */
  const formatResult = (result: TestResult, withFailureDetails: boolean): string => {
    const duration = result.duration || 0
    let section = `### ${result.test} - ${result.model}\n\n`
    section += `- Prompt: \`${result.prompt}\`\n`
    section += `- Expected: \`${result.expected}\`\n`
    section += `- Actual: \`${result.result?.[0] ?? ''}\`\n`
    section += `- Duration: ${duration}ms (${(duration / 1000).toFixed(2)}s)\n`
    if (withFailureDetails) {
      if (result.error) {
        section += `- Error: ${result.error.message}\n`
      }
      if (result.reason) {
        section += `- Reason: ${result.reason}\n`
      }
    }
    section += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
    return section
  }

  // A numeric trailing argument is the timeout in ms; equivalent to `{ timeout }`.
  it.each(models)('should generate SEO keywords for a descriptive text with model %s', async (modelName) => {
    await runKeywordCase(DESCRIPTIVE_CASE, modelName)
  }, TEST_TIMEOUT)

  it.each(models)('should generate SEO keywords for a technical text with model %s', async (modelName) => {
    await runKeywordCase(TECHNICAL_CASE, modelName)
  }, TEST_TIMEOUT)

  it('should generate markdown report with model rankings', () => {
    // Calculate average scores and sort models (best average first)
    const modelRankings = Object.entries(modelScores)
      .map(([model, scores]) => ({
        model,
        averageScore: scores.total / scores.tests,
        totalMatches: scores.total,
        testsRun: scores.tests,
        maxPossibleMatches: scores.possible
      }))
      .sort((a, b) => b.averageScore - a.averageScore)

    // Log the rankings
    console.log('\nSEO Keyword Generation Rankings:')
    console.log('--------------------------------')
    modelRankings.forEach((ranking, index) => {
      console.log(`${index + 1}. ${ranking.model}:`)
      console.log(`   Average matches per test: ${ranking.averageScore.toFixed(2)}`)
      console.log(`   Total matches: ${ranking.totalMatches}/${ranking.maxPossibleMatches}`)
      console.log(`   Tests run: ${ranking.testsRun}`)
    })

    // Generate markdown report
    let report = `# SEO Keyword Generation Test Results\n\n`

    // Add SEO rankings section
    report += '## Model Rankings\n\n'
    report += '| Model | Avg Matches | Total Matches | Tests Run |\n'
    report += '|-------|-------------|---------------|-----------|'
    for (const { model, averageScore, totalMatches, testsRun, maxPossibleMatches } of modelRankings) {
      report += `\n| ${model} | ${averageScore.toFixed(2)} | ${totalMatches}/${maxPossibleMatches} | ${testsRun} |`
    }
    report += '\n\n'

    // Add summary section
    const passedResults = testResults.filter(r => r.passed)
    const failedResults = testResults.filter(r => !r.passed)
    const totalTests = testResults.length
    const totalDuration = testResults.reduce((sum, r) => sum + (r.duration || 0), 0)
    // Guard against an empty run so the summary never prints NaN.
    const avgDuration = totalTests > 0 ? totalDuration / totalTests : 0
    const successRate = totalTests > 0 ? (passedResults.length / totalTests) * 100 : 0

    report += '## Summary\n\n'
    report += `- Total Tests: ${totalTests}\n`
    report += `- Passed: ${passedResults.length}\n`
    report += `- Failed: ${failedResults.length}\n`
    report += `- Success Rate: ${successRate.toFixed(2)}%\n`
    report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n`

    // Add failed tests section
    report += '## Failed Tests\n\n'
    report += failedResults.length === 0
      ? '*No failed tests*\n\n'
      : failedResults.map(r => formatResult(r, true)).join('')

    // Add passed tests section
    report += '## Passed Tests\n\n'
    report += passedResults.length === 0
      ? '*No passed tests*\n\n'
      : passedResults.map(r => formatResult(r, false)).join('')

    // Write report to file
    write(TEST_REPORT_PATH, report)
    expect(exists(TEST_REPORT_PATH) === 'file').toBe(true)
  })
})