kbot tests: seo
This commit is contained in:
parent
bc644f8635
commit
e51e8a5b7b
File diff suppressed because one or more lines are too long
3
packages/kbot/dist-in/index.d.ts
vendored
3
packages/kbot/dist-in/index.d.ts
vendored
@ -1,4 +1,5 @@
|
||||
export declare const logger: any;
|
||||
import { Logger, ILogObj } from 'tslog';
|
||||
export declare const logger: Logger<ILogObj>;
|
||||
export { run } from './commands/run.js';
|
||||
export declare const module_root: () => string;
|
||||
export declare const assistant_supported: Record<string, string>;
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
import { platform } from 'node:process';
|
||||
import path from 'node:path';
|
||||
const isWindows = platform === 'win32';
|
||||
import { createLogger } from '@polymech/log';
|
||||
import { get_var } from '@polymech/commons';
|
||||
import { MODULE_NAME } from './constants.js';
|
||||
import { createLogger } from '@polymech/log';
|
||||
export const logger = createLogger('llm-tools');
|
||||
export { run } from './commands/run.js';
|
||||
export const module_root = () => path.resolve(path.join(get_var(isWindows ? 'HOMEPATH' : 'HOME'), `.${MODULE_NAME}`));
|
||||
@ -36,4 +36,4 @@ export * from './zod_schema.js';
|
||||
export { E_OPENAI_MODEL } from './models/cache/openai-models.js';
|
||||
export { E_OPENROUTER_MODEL } from './models/cache/openrouter-models.js';
|
||||
export { E_OPENROUTER_MODEL_FREE } from './models/cache/openrouter-models-free.js';
|
||||
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvaW5kZXgudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQUEsT0FBTyxFQUFFLFFBQVEsRUFBRSxNQUFNLGNBQWMsQ0FBQTtBQUN2QyxPQUFPLElBQUksTUFBTSxXQUFXLENBQUE7QUFFNUIsTUFBTSxTQUFTLEdBQUcsUUFBUSxLQUFLLE9BQU8sQ0FBQTtBQUV0QyxPQUFPLEVBQUUsWUFBWSxFQUFFLE1BQU0sZUFBZSxDQUFBO0FBQzVDLE9BQU8sRUFBRSxPQUFPLEVBQUUsTUFBTSxtQkFBbUIsQ0FBQTtBQUMzQyxPQUFPLEVBQUUsV0FBVyxFQUFFLE1BQU0sZ0JBQWdCLENBQUE7QUFFNUMsTUFBTSxDQUFDLE1BQU0sTUFBTSxHQUFRLFlBQVksQ0FBQyxXQUFXLENBQUMsQ0FBQTtBQUNwRCxPQUFPLEVBQUUsR0FBRyxFQUFFLE1BQU0sbUJBQW1CLENBQUE7QUFDdkMsTUFBTSxDQUFDLE1BQU0sV0FBVyxHQUFHLEdBQUcsRUFBRSxDQUFDLElBQUksQ0FBQyxPQUFPLENBQUMsSUFBSSxDQUFDLElBQUksQ0FBQyxPQUFPLENBQUMsU0FBUyxDQUFDLENBQUMsQ0FBQyxVQUFVLENBQUMsQ0FBQyxDQUFDLE1BQU0sQ0FBQyxFQUFFLElBQUksV0FBVyxFQUFFLENBQUMsQ0FBQyxDQUFBO0FBRXJILE1BQU0sQ0FBQyxNQUFNLG1CQUFtQixHQUEyQjtJQUN6RCxJQUFJLEVBQUUsVUFBVTtJQUNoQixNQUFNLEVBQUUsWUFBWTtJQUNwQixLQUFLLEVBQUUsZUFBZTtJQUN0QixNQUFNLEVBQUUsVUFBVTtJQUNsQixNQUFNLEVBQUUsb0JBQW9CO0lBQzVCLE9BQU8sRUFBRSx5RUFBeUU7SUFDbEYsS0FBSyxFQUFFLGVBQWU7SUFDdEIsT0FBTyxFQUFFLFdBQVc7SUFDcEIsT0FBTyxFQUFFLGFBQWE7SUFDdEIsS0FBSyxFQUFFLGlCQUFpQjtJQUN4QixPQUFPLEVBQUUsa0JBQWtCO0lBQzNCLEtBQUssRUFBRSxlQUFlO0lBQ3RCLE1BQU0sRUFBRSxpQkFBaUI7SUFDekIsTUFBTSxFQUFFLFlBQVk7SUFDcEIsT0FBTyxFQUFFLDJFQUEyRTtJQUNwRixLQUFLLEVBQUUsZUFBZTtJQUN0QixLQUFLLEVBQUUsYUFBYTtJQUNwQixLQUFLLEVBQUUsa0JBQWtCO0lBQ3pCLE1BQU0sRUFBRSxZQUFZO0lBQ3BCLEtBQUssRUFBRSx3QkFBd0I7SUFDL0IsTUFBTSxFQUFFLFlBQVk7Q0FDckIsQ0FBQTtBQUNELGNBQWMsWUFBWSxDQUFBO0FBQzFCLGNBQWMsZ0JBQWdCLENBQUE7QUFDOUIsY0FBYyxpQkFBaUIsQ0FBQTtBQUUvQixPQUFPLEVBQUUsY0FBYyxFQUFFLE1BQU0saUNBQWlDLENBQUE7QUFDaEUsT0FBTyxFQUFFLGtCQUFrQixFQUFFLE1BQU0scUNBQXFDLENBQUE7QUFDeEUsT0FBTyxFQUFFLHVCQUF1QixFQUFFLE1BQU0sMENBQTBDLENBQUEifQ==
|
||||
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvaW5kZXgudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQUEsT0FBTyxFQUFFLFFBQVEsRUFBRSxNQUFNLGNBQWMsQ0FBQTtBQUN2QyxPQUFPLElBQUksTUFBTSxXQUFXLENBQUE7QUFFNUIsTUFBTSxTQUFTLEdBQUcsUUFBUSxLQUFLLE9BQU8sQ0FBQTtBQUV0QyxPQUFPLEVBQUUsT0FBTyxFQUFFLE1BQU0sbUJBQW1CLENBQUE7QUFDM0MsT0FBTyxFQUFFLFdBQVcsRUFBRSxNQUFNLGdCQUFnQixDQUFBO0FBRTVDLE9BQU8sRUFBRSxZQUFZLEVBQUUsTUFBTSxlQUFlLENBQUE7QUFDNUMsTUFBTSxDQUFDLE1BQU0sTUFBTSxHQUFHLFlBQVksQ0FBQyxXQUFXLENBQStCLENBQUE7QUFDN0UsT0FBTyxFQUFFLEdBQUcsRUFBRSxNQUFNLG1CQUFtQixDQUFBO0FBQ3ZDLE1BQU0sQ0FBQyxNQUFNLFdBQVcsR0FBRyxHQUFHLEVBQUUsQ0FBQyxJQUFJLENBQUMsT0FBTyxDQUFDLElBQUksQ0FBQyxJQUFJLENBQUMsT0FBTyxDQUFDLFNBQVMsQ0FBQyxDQUFDLENBQUMsVUFBVSxDQUFDLENBQUMsQ0FBQyxNQUFNLENBQUMsRUFBRSxJQUFJLFdBQVcsRUFBRSxDQUFDLENBQUMsQ0FBQTtBQUVySCxNQUFNLENBQUMsTUFBTSxtQkFBbUIsR0FBMkI7SUFDekQsSUFBSSxFQUFFLFVBQVU7SUFDaEIsTUFBTSxFQUFFLFlBQVk7SUFDcEIsS0FBSyxFQUFFLGVBQWU7SUFDdEIsTUFBTSxFQUFFLFVBQVU7SUFDbEIsTUFBTSxFQUFFLG9CQUFvQjtJQUM1QixPQUFPLEVBQUUseUVBQXlFO0lBQ2xGLEtBQUssRUFBRSxlQUFlO0lBQ3RCLE9BQU8sRUFBRSxXQUFXO0lBQ3BCLE9BQU8sRUFBRSxhQUFhO0lBQ3RCLEtBQUssRUFBRSxpQkFBaUI7SUFDeEIsT0FBTyxFQUFFLGtCQUFrQjtJQUMzQixLQUFLLEVBQUUsZUFBZTtJQUN0QixNQUFNLEVBQUUsaUJBQWlCO0lBQ3pCLE1BQU0sRUFBRSxZQUFZO0lBQ3BCLE9BQU8sRUFBRSwyRUFBMkU7SUFDcEYsS0FBSyxFQUFFLGVBQWU7SUFDdEIsS0FBSyxFQUFFLGFBQWE7SUFDcEIsS0FBSyxFQUFFLGtCQUFrQjtJQUN6QixNQUFNLEVBQUUsWUFBWTtJQUNwQixLQUFLLEVBQUUsd0JBQXdCO0lBQy9CLE1BQU0sRUFBRSxZQUFZO0NBQ3JCLENBQUE7QUFDRCxjQUFjLFlBQVksQ0FBQTtBQUMxQixjQUFjLGdCQUFnQixDQUFBO0FBQzlCLGNBQWMsaUJBQWlCLENBQUE7QUFFL0IsT0FBTyxFQUFFLGNBQWMsRUFBRSxNQUFNLGlDQUFpQyxDQUFBO0FBQ2hFLE9BQU8sRUFBRSxrQkFBa0IsRUFBRSxNQUFNLHFDQUFxQyxDQUFBO0FBQ3hFLE9BQU8sRUFBRSx1QkFBdUIsRUFBRSxNQUFNLDBDQUEwQyxDQUFBIn0=
|
||||
@ -102,7 +102,7 @@ export const OptionsSchema = (opts) => {
|
||||
${chalk.green.bold('custom')}: custom mode
|
||||
`))
|
||||
.add('logLevel', z.number()
|
||||
.default(2)
|
||||
.default(4)
|
||||
.describe('Logging level for the application'))
|
||||
.add('profile', z.string()
|
||||
.optional()
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "divide 15 by 3. Return only the number, no explanation."
|
||||
"content": "Generate 5 SEO keywords for this text: \"Machine learning algorithms use statistical methods to enable computers to learn from data and improve their performance over time without being explicitly programmed.\" Return only the keywords separated by commas, no explanation."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
|
||||
@ -20,6 +20,7 @@
|
||||
"test:basic": "vitest run tests/unit/basic.test.ts",
|
||||
"test:math": "vitest run tests/unit/math.test.ts",
|
||||
"test:format": "vitest run tests/unit/format.test.ts",
|
||||
"test:seo": "vitest run tests/unit/seo.test.ts",
|
||||
"test:language": "vitest run tests/unit/language.test.ts",
|
||||
"test2:watch": "vitest",
|
||||
"test2:coverage": "vitest run --coverage",
|
||||
|
||||
@ -11,7 +11,9 @@ import { ChatCompletionMessageParam } from 'openai/resources/index.mjs'
|
||||
|
||||
import { IKBotTask } from '@polymech/ai-tools'
|
||||
|
||||
import { logger } from '../index.js'
|
||||
import { createLogger } from '@polymech/log'
|
||||
import { Logger, ILogObj } from 'tslog'
|
||||
|
||||
import { createClient } from '../client.js'
|
||||
import { OptionsSchema } from '../zod_schema.js'
|
||||
import { get } from '../source.js'
|
||||
@ -30,6 +32,9 @@ import { all } from '../models/index.js'
|
||||
|
||||
export const processRun = async (opts: IKBotTask) => {
|
||||
let options: IKBotTask = null
|
||||
const logger = new Logger<ILogObj>({
|
||||
minLevel: opts.logLevel
|
||||
})
|
||||
const target = path.resolve(opts.output || opts.path)
|
||||
if (!exists(target)) {
|
||||
dir(target)
|
||||
@ -84,7 +89,7 @@ export const processRun = async (opts: IKBotTask) => {
|
||||
const logDir = path.resolve(resolve(opts.logs))
|
||||
const paramsPath = path.join(logDir, 'params.json')
|
||||
write(paramsPath, JSON.stringify({ ...params }, null, 2))
|
||||
logger.debug(`Read ${files.length} files from project ${path.resolve(options.path)} with ${options.include}`, files.map(f => f.path), params.tools.map(t => `${t.function.name} : ${t.function.description}`))
|
||||
logger.debug(`kbot run ${options.mode} : ${options.model} @ ${options.router} : ${files.length} files from project ${path.resolve(options.path)} with ${options.include}`, files.map(f => f.path), params.tools.map(t => `${t.function.name} : ${t.function.description}`))
|
||||
let ret = null
|
||||
options = await options.onRun(options) || options
|
||||
try {
|
||||
@ -164,6 +169,9 @@ function flattenArrays<T>(arrays: T[][]): T[] {
|
||||
|
||||
export const run = async (opts: IKBotTask) => {
|
||||
const ret = []
|
||||
const logger = new Logger<ILogObj>({
|
||||
minLevel: opts.logLevel
|
||||
})
|
||||
if (opts.include) {
|
||||
if (isString(opts.include)) {
|
||||
opts.include = [opts.include]
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
import { platform } from 'node:process'
|
||||
import path from 'node:path'
|
||||
|
||||
import { Logger, ILogObj } from 'tslog'
|
||||
const isWindows = platform === 'win32'
|
||||
|
||||
import { createLogger } from '@polymech/log'
|
||||
import { get_var } from '@polymech/commons'
|
||||
import { MODULE_NAME } from './constants.js'
|
||||
|
||||
export const logger: any = createLogger('llm-tools')
|
||||
import { createLogger } from '@polymech/log'
|
||||
export const logger = createLogger('llm-tools') as unknown as Logger<ILogObj>
|
||||
export { run } from './commands/run.js'
|
||||
export const module_root = () => path.resolve(path.join(get_var(isWindows ? 'HOMEPATH' : 'HOME'), `.${MODULE_NAME}`))
|
||||
|
||||
|
||||
@ -158,7 +158,7 @@ export const OptionsSchema = (opts?: any): any => {
|
||||
.add(
|
||||
'logLevel',
|
||||
z.number()
|
||||
.default(2)
|
||||
.default(4)
|
||||
.describe('Logging level for the application')
|
||||
)
|
||||
.add(
|
||||
|
||||
@ -23,7 +23,7 @@ export const isOpenRouterModel = (model: string): boolean => {
|
||||
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
|
||||
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
|
||||
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
|
||||
export const TEST_TIMEOUT = 30000 // 30 seconds timeout for API calls
|
||||
export const TEST_TIMEOUT = 5000 // 5 seconds timeout for API calls
|
||||
|
||||
// Report paths configuration
|
||||
export const REPORTS_DIR = path.resolve(__dirname, './reports')
|
||||
@ -127,6 +127,7 @@ export const runTest = async (
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
logLevel:4,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.model as string
|
||||
@ -277,27 +278,23 @@ export const generateTestReport = (
|
||||
// Add highscore section
|
||||
report += '## Highscores\n\n'
|
||||
|
||||
// Create a table header
|
||||
// Add regular test rankings
|
||||
report += '### Performance Rankings (Duration)\n\n'
|
||||
report += '| Test | Model | Duration (ms) | Duration (s) |\n'
|
||||
report += '|------|-------|--------------|--------------|\n'
|
||||
|
||||
// Sort all results by duration
|
||||
const allResults = Array.from(latestResults.entries())
|
||||
.flatMap(([testName, modelResults]) =>
|
||||
Array.from(modelResults.entries())
|
||||
.map(([model, result]) => ({
|
||||
test: testName,
|
||||
model,
|
||||
duration: result.duration || 0
|
||||
}))
|
||||
)
|
||||
.sort((a, b) => a.duration - b.duration)
|
||||
Array.from(latestResults.entries()).forEach(([testName, modelResults]) => {
|
||||
const sortedResults = Array.from(modelResults.entries())
|
||||
.map(([model, result]) => ({
|
||||
model,
|
||||
duration: result.duration || 0
|
||||
}))
|
||||
.sort((a, b) => a.duration - b.duration)
|
||||
|
||||
// Add all results to the table
|
||||
allResults.forEach(({ test, model, duration }) => {
|
||||
report += `| ${test} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
|
||||
sortedResults.forEach(({ model, duration }) => {
|
||||
report += `| ${testName} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
|
||||
})
|
||||
})
|
||||
|
||||
report += '\n'
|
||||
|
||||
// Add summary section
|
||||
@ -323,7 +320,7 @@ export const generateTestReport = (
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms (${(result.duration || 0 / 1000).toFixed(2)}s)\n`
|
||||
report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
|
||||
if (result.error) {
|
||||
report += `- Error Type: ${result.error.type}\n`
|
||||
report += `- Error Code: ${result.error.code}\n`
|
||||
@ -353,7 +350,7 @@ export const generateTestReport = (
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms (${(result.duration || 0 / 1000).toFixed(2)}s)\n`
|
||||
report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -743,6 +743,260 @@
|
||||
"passed": true,
|
||||
"duration": 1790,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "anthropic/claude-3.5-sonnet",
|
||||
"timestamp": "2025-04-02T13:44:06.429Z",
|
||||
"passed": true,
|
||||
"duration": 1689,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "qwen/qwq-32b",
|
||||
"timestamp": "2025-04-02T13:44:10.240Z",
|
||||
"passed": true,
|
||||
"duration": 3807,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-02T13:44:11.128Z",
|
||||
"passed": true,
|
||||
"duration": 885,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-02T13:44:21.587Z",
|
||||
"passed": true,
|
||||
"duration": 10455,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1",
|
||||
"router": "deepseek/deepseek-r1",
|
||||
"timestamp": "2025-04-02T13:44:33.654Z",
|
||||
"passed": true,
|
||||
"duration": 12064,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"The sum of 5 and 3 is 8. Therefore, the result is \\boxed{8}."
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-02T13:44:40.062Z",
|
||||
"passed": false,
|
||||
"duration": 6405,
|
||||
"reason": "Expected 8, but got the sum of 5 and 3 is 8. therefore, the result is \\boxed{8}.",
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "anthropic/claude-3.5-sonnet",
|
||||
"timestamp": "2025-04-02T13:44:41.261Z",
|
||||
"passed": true,
|
||||
"duration": 1190,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "qwen/qwq-32b",
|
||||
"timestamp": "2025-04-02T13:44:46.272Z",
|
||||
"passed": true,
|
||||
"duration": 5008,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-02T13:44:47.386Z",
|
||||
"passed": true,
|
||||
"duration": 1111,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-02T13:44:48.372Z",
|
||||
"passed": true,
|
||||
"duration": 984,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24\n\n24\n\nThe result is 24.\n\n24\n\nHere's the answer: 24\n\nThe answer will be 24.\n\n24\n\n24\n\nThe product of 8 and 3 is 24.\n\n24\n\nThe answer is 24.\n\n24\n\n24\n\n24\n\nThe result is 24.\n\n24\n\nHere's the numerical result: 24\n\nThe answer is 24.\n\n24\n\n24\n\nThe answer is 24."
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-r1",
|
||||
"router": "deepseek/deepseek-r1",
|
||||
"timestamp": "2025-04-02T13:44:53.633Z",
|
||||
"passed": false,
|
||||
"duration": 5258,
|
||||
"reason": "Expected 24, but got 24\n\n24\n\nthe result is 24.\n\n24\n\nhere's the answer: 24\n\nthe answer will be 24.\n\n24\n\n24\n\nthe product of 8 and 3 is 24.\n\n24\n\nthe answer is 24.\n\n24\n\n24\n\n24\n\nthe result is 24.\n\n24\n\nhere's the numerical result: 24\n\nthe answer is 24.\n\n24\n\n24\n\nthe answer is 24.",
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-02T13:44:55.196Z",
|
||||
"passed": true,
|
||||
"duration": 1558,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"router": "anthropic/claude-3.5-sonnet",
|
||||
"timestamp": "2025-04-02T13:44:56.604Z",
|
||||
"passed": true,
|
||||
"duration": 1405,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "qwen/qwq-32b",
|
||||
"router": "qwen/qwq-32b",
|
||||
"timestamp": "2025-04-02T13:44:57.523Z",
|
||||
"passed": true,
|
||||
"duration": 917,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-04-02T13:44:58.630Z",
|
||||
"passed": true,
|
||||
"duration": 1104,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"router": "openai/gpt-3.5-turbo",
|
||||
"timestamp": "2025-04-02T13:44:59.523Z",
|
||||
"passed": true,
|
||||
"duration": 889,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-r1",
|
||||
"router": "deepseek/deepseek-r1",
|
||||
"timestamp": "2025-04-02T13:45:06.658Z",
|
||||
"passed": true,
|
||||
"duration": 7130,
|
||||
"category": "basic"
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
||||
"timestamp": "2025-04-02T13:45:10.307Z",
|
||||
"passed": true,
|
||||
"duration": 3646,
|
||||
"category": "basic"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
@ -751,13 +1005,13 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1376,
|
||||
"duration_secs": 1.376
|
||||
"duration": 885,
|
||||
"duration_secs": 0.885
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1951,
|
||||
"duration_secs": 1.951
|
||||
"duration": 1689,
|
||||
"duration_secs": 1.689
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -766,13 +1020,13 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 870,
|
||||
"duration_secs": 0.87
|
||||
"duration": 984,
|
||||
"duration_secs": 0.984
|
||||
},
|
||||
{
|
||||
"model": "anthropic/claude-3.5-sonnet",
|
||||
"duration": 1096,
|
||||
"duration_secs": 1.096
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1111,
|
||||
"duration_secs": 1.111
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -781,16 +1035,16 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 794,
|
||||
"duration_secs": 0.794
|
||||
"duration": 889,
|
||||
"duration_secs": 0.889
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 929,
|
||||
"duration_secs": 0.929
|
||||
"model": "qwen/qwq-32b",
|
||||
"duration": 917,
|
||||
"duration_secs": 0.917
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-04-02T11:26:40.358Z"
|
||||
"lastUpdated": "2025-04-02T13:45:10.308Z"
|
||||
}
|
||||
@ -4,36 +4,130 @@
|
||||
|
||||
| Test | Model | Duration (ms) | Duration (s) |
|
||||
|------|-------|--------------|--------------|
|
||||
| division | openai/gpt-3.5-turbo | 794 | 0.79 |
|
||||
| multiplication | openai/gpt-3.5-turbo | 870 | 0.87 |
|
||||
| division | openai/gpt-4o-mini | 929 | 0.93 |
|
||||
| multiplication | anthropic/claude-3.5-sonnet | 1096 | 1.10 |
|
||||
| division | anthropic/claude-3.5-sonnet | 1276 | 1.28 |
|
||||
| addition | openai/gpt-4o-mini | 1376 | 1.38 |
|
||||
| multiplication | openai/gpt-4o-mini | 1444 | 1.44 |
|
||||
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 1790 | 1.79 |
|
||||
| addition | anthropic/claude-3.5-sonnet | 1951 | 1.95 |
|
||||
| multiplication | qwen/qwq-32b | 3592 | 3.59 |
|
||||
| addition | qwen/qwq-32b | 3726 | 3.73 |
|
||||
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 5646 | 5.65 |
|
||||
| division | qwen/qwq-32b | 5768 | 5.77 |
|
||||
| addition | openai/gpt-3.5-turbo | 7188 | 7.19 |
|
||||
| addition | deepseek/deepseek-r1-distill-qwen-14b:free | 7531 | 7.53 |
|
||||
| multiplication | deepseek/deepseek-r1 | 10983 | 10.98 |
|
||||
| addition | deepseek/deepseek-r1 | 15157 | 15.16 |
|
||||
| division | deepseek/deepseek-r1 | 15409 | 15.41 |
|
||||
| addition | openai/gpt-4o-mini | 885 | 0.89 |
|
||||
| division | openai/gpt-3.5-turbo | 889 | 0.89 |
|
||||
| division | qwen/qwq-32b | 917 | 0.92 |
|
||||
| multiplication | openai/gpt-3.5-turbo | 984 | 0.98 |
|
||||
| division | openai/gpt-4o-mini | 1104 | 1.10 |
|
||||
| multiplication | openai/gpt-4o-mini | 1111 | 1.11 |
|
||||
| multiplication | anthropic/claude-3.5-sonnet | 1190 | 1.19 |
|
||||
| division | anthropic/claude-3.5-sonnet | 1405 | 1.41 |
|
||||
| multiplication | deepseek/deepseek-r1-distill-qwen-14b:free | 1558 | 1.56 |
|
||||
| addition | anthropic/claude-3.5-sonnet | 1689 | 1.69 |
|
||||
| division | deepseek/deepseek-r1-distill-qwen-14b:free | 3646 | 3.65 |
|
||||
| addition | qwen/qwq-32b | 3807 | 3.81 |
|
||||
| multiplication | qwen/qwq-32b | 5008 | 5.01 |
|
||||
| division | deepseek/deepseek-r1 | 7130 | 7.13 |
|
||||
| addition | openai/gpt-3.5-turbo | 10455 | 10.46 |
|
||||
| addition | deepseek/deepseek-r1 | 12064 | 12.06 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 18
|
||||
- Passed: 18
|
||||
- Failed: 0
|
||||
- Success Rate: 100.00%
|
||||
- Average Duration: 4807ms (4.81s)
|
||||
- Passed: 16
|
||||
- Failed: 2
|
||||
- Success Rate: 88.89%
|
||||
- Average Duration: 3639ms (3.64s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
*No failed tests*
|
||||
### addition - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `The sum of 5 and 3 is 8. Therefore, the result is \boxed{8}.`
|
||||
- Duration: 6405ms (6.41s)
|
||||
- Reason: Expected 8, but got the sum of 5 and 3 is 8. therefore, the result is \boxed{8}.
|
||||
- Timestamp: 4/2/2025, 3:44:40 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24
|
||||
|
||||
24
|
||||
|
||||
The result is 24.
|
||||
|
||||
24
|
||||
|
||||
Here's the answer: 24
|
||||
|
||||
The answer will be 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
The product of 8 and 3 is 24.
|
||||
|
||||
24
|
||||
|
||||
The answer is 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
The result is 24.
|
||||
|
||||
24
|
||||
|
||||
Here's the numerical result: 24
|
||||
|
||||
The answer is 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
The answer is 24.`
|
||||
- Duration: 5258ms (5.26s)
|
||||
- Reason: Expected 24, but got 24
|
||||
|
||||
24
|
||||
|
||||
the result is 24.
|
||||
|
||||
24
|
||||
|
||||
here's the answer: 24
|
||||
|
||||
the answer will be 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
the product of 8 and 3 is 24.
|
||||
|
||||
24
|
||||
|
||||
the answer is 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
the result is 24.
|
||||
|
||||
24
|
||||
|
||||
here's the numerical result: 24
|
||||
|
||||
the answer is 24.
|
||||
|
||||
24
|
||||
|
||||
24
|
||||
|
||||
the answer is 24.
|
||||
- Timestamp: 4/2/2025, 3:44:53 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
@ -42,142 +136,126 @@
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1951ms (1951.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:15 PM
|
||||
- Duration: 1689ms (1.69s)
|
||||
- Timestamp: 4/2/2025, 3:44:06 PM
|
||||
|
||||
### addition - qwen/qwq-32b
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 3726ms (3726.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:19 PM
|
||||
- Duration: 3807ms (3.81s)
|
||||
- Timestamp: 4/2/2025, 3:44:10 PM
|
||||
|
||||
### addition - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1376ms (1376.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:20 PM
|
||||
- Duration: 885ms (0.89s)
|
||||
- Timestamp: 4/2/2025, 3:44:11 PM
|
||||
|
||||
### addition - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 7188ms (7188.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:28 PM
|
||||
- Duration: 10455ms (10.46s)
|
||||
- Timestamp: 4/2/2025, 3:44:21 PM
|
||||
|
||||
### addition - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 15157ms (15157.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:43 PM
|
||||
|
||||
### addition - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 7531ms (7531.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:50 PM
|
||||
- Duration: 12064ms (12.06s)
|
||||
- Timestamp: 4/2/2025, 3:44:33 PM
|
||||
|
||||
### multiplication - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 1096ms (1096.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:51 PM
|
||||
- Duration: 1190ms (1.19s)
|
||||
- Timestamp: 4/2/2025, 3:44:41 PM
|
||||
|
||||
### multiplication - qwen/qwq-32b
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 3592ms (3592.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:55 PM
|
||||
- Duration: 5008ms (5.01s)
|
||||
- Timestamp: 4/2/2025, 3:44:46 PM
|
||||
|
||||
### multiplication - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 1444ms (1444.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:56 PM
|
||||
- Duration: 1111ms (1.11s)
|
||||
- Timestamp: 4/2/2025, 3:44:47 PM
|
||||
|
||||
### multiplication - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 870ms (870.00s)
|
||||
- Timestamp: 4/2/2025, 1:25:57 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 10983ms (10983.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:08 PM
|
||||
- Duration: 984ms (0.98s)
|
||||
- Timestamp: 4/2/2025, 3:44:48 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Duration: 5646ms (5646.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:14 PM
|
||||
- Duration: 1558ms (1.56s)
|
||||
- Timestamp: 4/2/2025, 3:44:55 PM
|
||||
|
||||
### division - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 1276ms (1276.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:15 PM
|
||||
- Duration: 1405ms (1.41s)
|
||||
- Timestamp: 4/2/2025, 3:44:56 PM
|
||||
|
||||
### division - qwen/qwq-32b
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 5768ms (5768.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:21 PM
|
||||
- Duration: 917ms (0.92s)
|
||||
- Timestamp: 4/2/2025, 3:44:57 PM
|
||||
|
||||
### division - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 929ms (929.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:22 PM
|
||||
- Duration: 1104ms (1.10s)
|
||||
- Timestamp: 4/2/2025, 3:44:58 PM
|
||||
|
||||
### division - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 794ms (794.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:23 PM
|
||||
- Duration: 889ms (0.89s)
|
||||
- Timestamp: 4/2/2025, 3:44:59 PM
|
||||
|
||||
### division - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 15409ms (15409.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:38 PM
|
||||
- Duration: 7130ms (7.13s)
|
||||
- Timestamp: 4/2/2025, 3:45:06 PM
|
||||
|
||||
### division - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Duration: 1790ms (1790.00s)
|
||||
- Timestamp: 4/2/2025, 1:26:40 PM
|
||||
- Duration: 3646ms (3.65s)
|
||||
- Timestamp: 4/2/2025, 3:45:10 PM
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,38 +1,39 @@
|
||||
# Math Operations Test Results
|
||||
|
||||
|
||||
## Highscores
|
||||
|
||||
| Test | Model | Duration (ms) | Duration (s) |
|
||||
|------|-------|--------------|--------------|
|
||||
| power | openai/gpt-3.5-turbo | 719 | 0.72 |
|
||||
| power | openai/gpt-4o-mini | 743 | 0.74 |
|
||||
| square_root | openai/gpt-3.5-turbo | 751 | 0.75 |
|
||||
| square_root | anthropic/claude-3.5-sonnet | 756 | 0.76 |
|
||||
| factorial | openai/gpt-3.5-turbo | 782 | 0.78 |
|
||||
| square_root | openai/gpt-4o-mini | 785 | 0.79 |
|
||||
| fibonacci | openai/gpt-3.5-turbo | 800 | 0.80 |
|
||||
| factorial | openai/gpt-4o-mini | 872 | 0.87 |
|
||||
| quadratic | openai/gpt-3.5-turbo | 878 | 0.88 |
|
||||
| factorial | anthropic/claude-3.5-sonnet | 970 | 0.97 |
|
||||
| fibonacci | openai/gpt-4o-mini | 974 | 0.97 |
|
||||
| quadratic | openai/gpt-4o-mini | 994 | 0.99 |
|
||||
| power | anthropic/claude-3.5-sonnet | 1241 | 1.24 |
|
||||
| quadratic | anthropic/claude-3.5-sonnet | 1650 | 1.65 |
|
||||
| fibonacci | anthropic/claude-3.5-sonnet | 3791 | 3.79 |
|
||||
| factorial | qwen/qwq-32b | 4954 | 4.95 |
|
||||
| power | qwen/qwq-32b | 6255 | 6.25 |
|
||||
| square_root | qwen/qwq-32b | 6745 | 6.75 |
|
||||
| quadratic | qwen/qwq-32b | 10222 | 10.22 |
|
||||
| fibonacci | qwen/qwq-32b | 12322 | 12.32 |
|
||||
| factorial | openai/gpt-3.5-turbo | 827 | 0.83 |
|
||||
| factorial | openai/gpt-4o-mini | 956 | 0.96 |
|
||||
| square_root | openai/gpt-4o-mini | 964 | 0.96 |
|
||||
| square_root | openai/gpt-3.5-turbo | 1080 | 1.08 |
|
||||
| power | anthropic/claude-3.5-sonnet | 1136 | 1.14 |
|
||||
| power | openai/gpt-4o-mini | 1259 | 1.26 |
|
||||
| power | openai/gpt-3.5-turbo | 1498 | 1.50 |
|
||||
| fibonacci | openai/gpt-3.5-turbo | 1543 | 1.54 |
|
||||
| fibonacci | openai/gpt-4o-mini | 1673 | 1.67 |
|
||||
| factorial | anthropic/claude-3.5-sonnet | 1853 | 1.85 |
|
||||
| fibonacci | anthropic/claude-3.5-sonnet | 2004 | 2.00 |
|
||||
| square_root | anthropic/claude-3.5-sonnet | 2012 | 2.01 |
|
||||
| factorial | deepseek/deepseek-r1-distill-qwen-14b:free | 4814 | 4.81 |
|
||||
| power | deepseek/deepseek-r1 | 5414 | 5.41 |
|
||||
| square_root | qwen/qwq-32b | 5888 | 5.89 |
|
||||
| square_root | deepseek/deepseek-r1-distill-qwen-14b:free | 6114 | 6.11 |
|
||||
| quadratic | qwen/qwq-32b | 6795 | 6.79 |
|
||||
| factorial | qwen/qwq-32b | 6892 | 6.89 |
|
||||
| power | qwen/qwq-32b | 7572 | 7.57 |
|
||||
| power | deepseek/deepseek-r1-distill-qwen-14b:free | 9891 | 9.89 |
|
||||
| square_root | deepseek/deepseek-r1 | 10309 | 10.31 |
|
||||
| factorial | deepseek/deepseek-r1 | 11193 | 11.19 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 20
|
||||
- Passed: 14
|
||||
- Failed: 6
|
||||
- Success Rate: 70.00%
|
||||
- Average Duration: 2860ms (2.86s)
|
||||
- Total Tests: 29
|
||||
- Passed: 22
|
||||
- Failed: 7
|
||||
- Success Rate: 75.86%
|
||||
- Average Duration: 4745ms (4.75s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
@ -41,54 +42,67 @@
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-3,-2`
|
||||
- Actual: `-2,-3`
|
||||
- Duration: 1650ms (1650.00s)
|
||||
- Duration: 1892ms (1892.00s)
|
||||
- Reason: Expected -3,-2, but got -2,-3
|
||||
- Timestamp: 4/2/2025, 1:22:10 PM
|
||||
- Timestamp: 4/2/2025, 3:32:51 PM
|
||||
|
||||
### quadratic - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-3,-2`
|
||||
- Actual: `-2, -3`
|
||||
- Duration: 994ms (994.00s)
|
||||
- Duration: 853ms (853.00s)
|
||||
- Reason: Expected -3,-2, but got -2, -3
|
||||
- Timestamp: 4/2/2025, 1:22:21 PM
|
||||
- Timestamp: 4/2/2025, 3:32:59 PM
|
||||
|
||||
### quadratic - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-3,-2`
|
||||
- Actual: `-2, -3`
|
||||
- Duration: 878ms (878.00s)
|
||||
- Duration: 832ms (832.00s)
|
||||
- Reason: Expected -3,-2, but got -2, -3
|
||||
- Timestamp: 4/2/2025, 1:22:22 PM
|
||||
- Timestamp: 4/2/2025, 3:32:59 PM
|
||||
|
||||
### quadratic - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-3,-2`
|
||||
- Actual: `-2, -3`
|
||||
- Duration: 19850ms (19850.00s)
|
||||
- Reason: Expected -3,-2, but got -2, -3
|
||||
- Timestamp: 4/2/2025, 3:33:19 PM
|
||||
|
||||
### quadratic - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-3,-2`
|
||||
- Actual: `The solutions to the quadratic equation x² + 5x + 6 = 0 are x = -2 and x = -3.
|
||||
|
||||
-2,-3`
|
||||
- Duration: 15811ms (15811.00s)
|
||||
- Reason: Expected -3,-2, but got the solutions to the quadratic equation x² + 5x + 6 = 0 are x = -2 and x = -3.
|
||||
|
||||
-2,-3
|
||||
- Timestamp: 4/2/2025, 3:33:35 PM
|
||||
|
||||
### fibonacci - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `5`
|
||||
- Duration: 12322ms (12322.00s)
|
||||
- Duration: 1509ms (1509.00s)
|
||||
- Reason: Expected 8, but got 5
|
||||
- Timestamp: 4/2/2025, 1:22:46 PM
|
||||
- Timestamp: 4/2/2025, 3:34:05 PM
|
||||
|
||||
### fibonacci - openai/gpt-4o-mini
|
||||
### fibonacci - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `5`
|
||||
- Duration: 974ms (974.00s)
|
||||
- Duration: 5171ms (5171.00s)
|
||||
- Reason: Expected 8, but got 5
|
||||
- Timestamp: 4/2/2025, 1:22:47 PM
|
||||
|
||||
### power - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: ``
|
||||
- Duration: 6255ms (6255.00s)
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/2/2025, 1:23:04 PM
|
||||
- Timestamp: 4/2/2025, 3:34:44 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
@ -97,110 +111,174 @@
|
||||
- Prompt: `Solve the quadratic equation x² + 5x + 6 = 0. Return only the solutions as comma-separated numbers, no explanation.`
|
||||
- Expected: `-3,-2`
|
||||
- Actual: `-3,-2`
|
||||
- Duration: 10222ms (10222.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:20 PM
|
||||
- Duration: 6795ms (6795.00s)
|
||||
- Timestamp: 4/2/2025, 3:32:58 PM
|
||||
|
||||
### factorial - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 970ms (970.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:23 PM
|
||||
- Duration: 1853ms (1853.00s)
|
||||
- Timestamp: 4/2/2025, 3:33:37 PM
|
||||
|
||||
### factorial - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 4954ms (4954.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:28 PM
|
||||
- Duration: 6892ms (6892.00s)
|
||||
- Timestamp: 4/2/2025, 3:33:44 PM
|
||||
|
||||
### factorial - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 872ms (872.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:29 PM
|
||||
- Duration: 956ms (956.00s)
|
||||
- Timestamp: 4/2/2025, 3:33:45 PM
|
||||
|
||||
### factorial - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 782ms (782.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:30 PM
|
||||
- Duration: 827ms (827.00s)
|
||||
- Timestamp: 4/2/2025, 3:33:46 PM
|
||||
|
||||
### factorial - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 11193ms (11193.00s)
|
||||
- Timestamp: 4/2/2025, 3:33:57 PM
|
||||
|
||||
### factorial - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Calculate 5! (factorial of 5). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Duration: 4814ms (4814.00s)
|
||||
- Timestamp: 4/2/2025, 3:34:02 PM
|
||||
|
||||
### fibonacci - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 3791ms (3791.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:33 PM
|
||||
- Duration: 2004ms (2004.00s)
|
||||
- Timestamp: 4/2/2025, 3:34:04 PM
|
||||
|
||||
### fibonacci - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1673ms (1673.00s)
|
||||
- Timestamp: 4/2/2025, 3:34:07 PM
|
||||
|
||||
### fibonacci - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate the 6th number in the Fibonacci sequence. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 800ms (800.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:48 PM
|
||||
- Duration: 1543ms (1543.00s)
|
||||
- Timestamp: 4/2/2025, 3:34:08 PM
|
||||
|
||||
### square_root - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 756ms (756.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:48 PM
|
||||
- Duration: 2012ms (2012.00s)
|
||||
- Timestamp: 4/2/2025, 3:34:46 PM
|
||||
|
||||
### square_root - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 6745ms (6745.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:55 PM
|
||||
- Duration: 5888ms (5888.00s)
|
||||
- Timestamp: 4/2/2025, 3:34:52 PM
|
||||
|
||||
### square_root - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 785ms (785.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:56 PM
|
||||
- Duration: 964ms (964.00s)
|
||||
- Timestamp: 4/2/2025, 3:34:52 PM
|
||||
|
||||
### square_root - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 751ms (751.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:57 PM
|
||||
- Duration: 1080ms (1080.00s)
|
||||
- Timestamp: 4/2/2025, 3:34:54 PM
|
||||
|
||||
### square_root - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 10309ms (10309.00s)
|
||||
- Timestamp: 4/2/2025, 3:35:04 PM
|
||||
|
||||
### square_root - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Calculate the square root of 16. Return only the number, no explanation.`
|
||||
- Expected: `4`
|
||||
- Actual: `4`
|
||||
- Duration: 6114ms (6114.00s)
|
||||
- Timestamp: 4/2/2025, 3:35:10 PM
|
||||
|
||||
### power - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 1241ms (1241.00s)
|
||||
- Timestamp: 4/2/2025, 1:22:58 PM
|
||||
- Duration: 1136ms (1136.00s)
|
||||
- Timestamp: 4/2/2025, 3:35:11 PM
|
||||
|
||||
### power - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 7572ms (7572.00s)
|
||||
- Timestamp: 4/2/2025, 3:35:19 PM
|
||||
|
||||
### power - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 743ms (743.00s)
|
||||
- Timestamp: 4/2/2025, 1:23:05 PM
|
||||
- Duration: 1259ms (1259.00s)
|
||||
- Timestamp: 4/2/2025, 3:35:20 PM
|
||||
|
||||
### power - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 719ms (719.00s)
|
||||
- Timestamp: 4/2/2025, 1:23:06 PM
|
||||
- Duration: 1498ms (1498.00s)
|
||||
- Timestamp: 4/2/2025, 3:35:21 PM
|
||||
|
||||
### power - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 5414ms (5414.00s)
|
||||
- Timestamp: 4/2/2025, 3:35:27 PM
|
||||
|
||||
### power - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Calculate 2 raised to the power of 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Duration: 9891ms (9891.00s)
|
||||
- Timestamp: 4/2/2025, 3:35:37 PM
|
||||
|
||||
|
||||
3086
packages/kbot/tests/unit/reports/seo.json
Normal file
3086
packages/kbot/tests/unit/reports/seo.json
Normal file
File diff suppressed because it is too large
Load Diff
124
packages/kbot/tests/unit/reports/seo.md
Normal file
124
packages/kbot/tests/unit/reports/seo.md
Normal file
@ -0,0 +1,124 @@
|
||||
# SEO Keyword Generation Test Results
|
||||
|
||||
## Model Rankings
|
||||
|
||||
| Model | Avg Matches | Total Matches | Tests Run |
|
||||
|-------|-------------|---------------|-----------|
|
||||
| anthropic/claude-3.5-sonnet | 3.50 | 7/10 | 2 |
|
||||
| openai/gpt-4o-mini | 3.50 | 7/10 | 2 |
|
||||
| openai/gpt-3.5-turbo | 3.50 | 7/10 | 2 |
|
||||
|
||||
## Summary
|
||||
|
||||
- Total Tests: 12
|
||||
- Passed: 6
|
||||
- Failed: 6
|
||||
- Success Rate: 50.00%
|
||||
- Average Duration: 3222ms (3.22s)
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### seo_keywords_text - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Generate SEO keywords for descriptive text`
|
||||
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
|
||||
- Actual: ``
|
||||
- Duration: 5000ms (5.00s)
|
||||
- Reason: API call timed out
|
||||
- Timestamp: 4/2/2025, 4:25:03 PM
|
||||
|
||||
### seo_keywords_text - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `Generate SEO keywords for descriptive text`
|
||||
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
|
||||
- Actual: ``
|
||||
- Duration: 5000ms (5.00s)
|
||||
- Reason: API call timed out
|
||||
- Timestamp: 4/2/2025, 4:25:10 PM
|
||||
|
||||
### seo_keywords_text - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Generate SEO keywords for descriptive text`
|
||||
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
|
||||
- Actual: ``
|
||||
- Duration: 5000ms (5.00s)
|
||||
- Reason: API call timed out
|
||||
- Timestamp: 4/2/2025, 4:25:15 PM
|
||||
|
||||
### seo_keywords_technical - qwen/qwq-32b
|
||||
|
||||
- Prompt: `Generate SEO keywords for technical text`
|
||||
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
|
||||
- Actual: ``
|
||||
- Duration: 5000ms (5.00s)
|
||||
- Reason: API call timed out
|
||||
- Timestamp: 4/2/2025, 4:25:22 PM
|
||||
|
||||
### seo_keywords_technical - deepseek/deepseek-r1
|
||||
|
||||
- Prompt: `Generate SEO keywords for technical text`
|
||||
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
|
||||
- Actual: ``
|
||||
- Duration: 5000ms (5.00s)
|
||||
- Reason: API call timed out
|
||||
- Timestamp: 4/2/2025, 4:25:30 PM
|
||||
|
||||
### seo_keywords_technical - deepseek/deepseek-r1-distill-qwen-14b:free
|
||||
|
||||
- Prompt: `Generate SEO keywords for technical text`
|
||||
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
|
||||
- Actual: ``
|
||||
- Duration: 5000ms (5.00s)
|
||||
- Reason: API call timed out
|
||||
- Timestamp: 4/2/2025, 4:25:35 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### seo_keywords_text - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Generate 5 SEO keywords for this text: "The ancient city of Rome, with its magnificent Colosseum, historic Roman Forum, and stunning Vatican City, offers visitors a unique blend of history, art, and culture." Return only the keywords separated by commas, no explanation.`
|
||||
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
|
||||
- Actual: `ancient Rome, Colosseum tourism, Roman Forum attractions, Vatican City sightseeing, historic Rome landmarks`
|
||||
- Duration: 1719ms (1.72s)
|
||||
- Timestamp: 4/2/2025, 4:24:58 PM
|
||||
|
||||
### seo_keywords_text - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Generate 5 SEO keywords for this text: "The ancient city of Rome, with its magnificent Colosseum, historic Roman Forum, and stunning Vatican City, offers visitors a unique blend of history, art, and culture." Return only the keywords separated by commas, no explanation.`
|
||||
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
|
||||
- Actual: `ancient Rome, Colosseum, Roman Forum, Vatican City, history art culture`
|
||||
- Duration: 1539ms (1.54s)
|
||||
- Timestamp: 4/2/2025, 4:25:04 PM
|
||||
|
||||
### seo_keywords_text - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Generate 5 SEO keywords for this text: "The ancient city of Rome, with its magnificent Colosseum, historic Roman Forum, and stunning Vatican City, offers visitors a unique blend of history, art, and culture." Return only the keywords separated by commas, no explanation.`
|
||||
- Expected: `Rome, Colosseum, Roman Forum, Vatican City, ancient history`
|
||||
- Actual: `ancient city of Rome, Colosseum, Roman Forum, Vatican City, history, art, culture`
|
||||
- Duration: 1002ms (1.00s)
|
||||
- Timestamp: 4/2/2025, 4:25:05 PM
|
||||
|
||||
### seo_keywords_technical - anthropic/claude-3.5-sonnet
|
||||
|
||||
- Prompt: `Generate 5 SEO keywords for this text: "Machine learning algorithms use statistical methods to enable computers to learn from data and improve their performance over time without being explicitly programmed." Return only the keywords separated by commas, no explanation.`
|
||||
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
|
||||
- Actual: `machine learning algorithms, statistical learning, data science, automated learning, predictive modeling`
|
||||
- Duration: 1830ms (1.83s)
|
||||
- Timestamp: 4/2/2025, 4:25:17 PM
|
||||
|
||||
### seo_keywords_technical - openai/gpt-4o-mini
|
||||
|
||||
- Prompt: `Generate 5 SEO keywords for this text: "Machine learning algorithms use statistical methods to enable computers to learn from data and improve their performance over time without being explicitly programmed." Return only the keywords separated by commas, no explanation.`
|
||||
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
|
||||
- Actual: `machine learning algorithms, statistical methods, computer learning, data improvement, performance enhancement`
|
||||
- Duration: 1694ms (1.69s)
|
||||
- Timestamp: 4/2/2025, 4:25:24 PM
|
||||
|
||||
### seo_keywords_technical - openai/gpt-3.5-turbo
|
||||
|
||||
- Prompt: `Generate 5 SEO keywords for this text: "Machine learning algorithms use statistical methods to enable computers to learn from data and improve their performance over time without being explicitly programmed." Return only the keywords separated by commas, no explanation.`
|
||||
- Expected: `machine learning, algorithms, artificial intelligence, data science, statistical methods`
|
||||
- Actual: `Machine learning algorithms, statistical methods, computers, data, performance.`
|
||||
- Duration: 881ms (0.88s)
|
||||
- Timestamp: 4/2/2025, 4:25:25 PM
|
||||
|
||||
245
packages/kbot/tests/unit/seo.test.ts
Normal file
245
packages/kbot/tests/unit/seo.test.ts
Normal file
@ -0,0 +1,245 @@
|
||||
import { describe, it, expect } from 'vitest'
|
||||
import * as path from 'node:path'
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import {
|
||||
getDefaultModels,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
runTest,
|
||||
getReportPaths
|
||||
} from './commons'
|
||||
|
||||
// Models exercised by every parameterized test in this file (via it.each).
// Uses the shared defaults from ./commons; replace this call with an explicit
// list to override the models for this specific test file only.
|
||||
const models = getDefaultModels()
|
||||
|
||||
/**
 * SEO keyword generation suite.
 *
 * Each model is asked to produce comma-separated keywords for two sample
 * texts. An answer is scored by how many of the expected keywords appear
 * (as substrings) in the generated keywords. The final test renders a
 * markdown report from the results collected by the preceding tests, so it
 * relies on the tests in this file running in declaration order.
 */
describe('SEO Keyword Generation', () => {
  // Every result produced (or synthesized on failure) by the keyword tests.
  const testResults: TestResult[] = []
  const TEST_LOG_PATH = getReportPaths('seo', 'json')
  const TEST_REPORT_PATH = getReportPaths('seo', 'md')

  // Minimum number of expected keywords an answer must contain to pass.
  const MIN_MATCHES = 2

  // Per-model tally: matched keywords, number of tests, and the maximum
  // number of matches that were achievable across those tests.
  const modelScores: Record<string, { total: number, tests: number, possible: number }> = {}

  /** Splits a comma-separated keyword string into trimmed, lower-cased, non-empty keywords. */
  const toKeywords = (text: string): string[] =>
    text
      .trim()
      .toLowerCase()
      .split(',')
      .map(keyword => keyword.trim())
      .filter(keyword => keyword.length > 0)

  /**
   * Adds one test outcome to a model's tally.
   *
   * @param modelName - model the outcome belongs to
   * @param matched - number of expected keywords found in the answer
   * @param possible - number of expected keywords for that test
   */
  const recordScore = (modelName: string, matched: number, possible: number): void => {
    if (!modelScores[modelName]) {
      modelScores[modelName] = { total: 0, tests: 0, possible: 0 }
    }
    modelScores[modelName].total += matched
    modelScores[modelName].tests += 1
    modelScores[modelName].possible += possible
  }

  /**
   * Runs one keyword-generation case against one model and records the
   * outcome in `testResults` and `modelScores`.
   *
   * A failing call (e.g. a timeout) is recorded as a failed result with zero
   * matches instead of being rethrown, so one unavailable model neither aborts
   * the run nor silently disappears from the rankings.
   *
   * @param modelName - model identifier passed to `runTest`
   * @param testName - name stored in the result's `test` field
   * @param prompt - full prompt sent to the model
   * @param expected - comma-separated expected keywords
   */
  const runKeywordCase = async (
    modelName: string,
    testName: string,
    prompt: string,
    expected: string
  ): Promise<void> => {
    const expectedKeywords = toKeywords(expected)

    let result: TestResult
    try {
      result = await runTest(prompt, expected, testName, modelName, TEST_LOG_PATH)
    } catch (error) {
      // Handle timeout or other errors gracefully. The real prompt is stored
      // so the report shows what the model was actually asked.
      testResults.push({
        test: testName,
        prompt,
        result: [],
        expected,
        model: modelName,
        router: 'openrouter',
        timestamp: new Date().toISOString(),
        passed: false,
        duration: TEST_TIMEOUT,
        reason: error instanceof Error ? error.message : 'Unknown error occurred'
      })
      recordScore(modelName, 0, expectedKeywords.length)
      return
    }
    testResults.push(result)

    // Handle potential empty or invalid results
    const answer = result.result?.[0]
    if (!answer) {
      result.passed = false
      result.reason = 'No keywords generated'
      recordScore(modelName, 0, expectedKeywords.length)
      return
    }

    const resultKeywords = toKeywords(answer)
    // Substring match: "ancient rome" counts as a hit for "rome".
    const matchedKeywords = expectedKeywords.filter(keyword =>
      resultKeywords.some(candidate => candidate.includes(keyword))
    )
    recordScore(modelName, matchedKeywords.length, expectedKeywords.length)

    // Log the actual results for debugging
    console.log(`Model ${modelName} generated keywords:`, resultKeywords)
    console.log(`Matched keywords:`, matchedKeywords)
    console.log(`Current score for ${modelName}: ${matchedKeywords.length}/${expectedKeywords.length}`)

    // The result object is already in testResults; updating it here changes
    // what the report test will see.
    result.passed = matchedKeywords.length >= MIN_MATCHES
    result.reason = result.passed
      ? undefined
      : `Only matched ${matchedKeywords.length} keywords, expected at least ${MIN_MATCHES}`
  }

  /**
   * Renders one result as a markdown section.
   *
   * @param result - result to render
   * @param withDiagnostics - when true, include the error and reason lines
   */
  const renderResult = (result: TestResult, withDiagnostics: boolean): string => {
    const duration = result.duration || 0
    const lines = [
      `### ${result.test} - ${result.model}`,
      '',
      `- Prompt: \`${result.prompt}\``,
      `- Expected: \`${result.expected}\``,
      `- Actual: \`${result.result[0] || ''}\``,
      `- Duration: ${duration}ms (${(duration / 1000).toFixed(2)}s)`
    ]
    if (withDiagnostics && result.error) {
      lines.push(`- Error: ${result.error.message}`)
    }
    if (withDiagnostics && result.reason) {
      lines.push(`- Reason: ${result.reason}`)
    }
    lines.push(`- Timestamp: ${new Date(result.timestamp).toLocaleString()}`)
    return lines.join('\n') + '\n\n'
  }

  it.each(models)('should generate SEO keywords for a descriptive text with model %s', async (modelName) => {
    await runKeywordCase(
      modelName,
      'seo_keywords_text',
      'Generate 5 SEO keywords for this text: "The ancient city of Rome, with its magnificent Colosseum, historic Roman Forum, and stunning Vatican City, offers visitors a unique blend of history, art, and culture." Return only the keywords separated by commas, no explanation.',
      'Rome, Colosseum, Roman Forum, Vatican City, ancient history'
    )
  }, { timeout: TEST_TIMEOUT })

  it.each(models)('should generate SEO keywords for a technical text with model %s', async (modelName) => {
    await runKeywordCase(
      modelName,
      'seo_keywords_technical',
      'Generate 5 SEO keywords for this text: "Machine learning algorithms use statistical methods to enable computers to learn from data and improve their performance over time without being explicitly programmed." Return only the keywords separated by commas, no explanation.',
      'machine learning, algorithms, artificial intelligence, data science, statistical methods'
    )
  }, { timeout: TEST_TIMEOUT })

  it('should generate markdown report with model rankings', () => {
    // Calculate average scores and sort models, best average first
    const modelRankings = Object.entries(modelScores)
      .map(([model, scores]) => ({
        model,
        averageScore: scores.tests > 0 ? scores.total / scores.tests : 0,
        totalMatches: scores.total,
        testsRun: scores.tests,
        maxPossibleMatches: scores.possible
      }))
      .sort((a, b) => b.averageScore - a.averageScore)

    // Log the rankings
    console.log('\nSEO Keyword Generation Rankings:')
    console.log('--------------------------------')
    modelRankings.forEach((ranking, index) => {
      console.log(`${index + 1}. ${ranking.model}:`)
      console.log(`   Average matches per test: ${ranking.averageScore.toFixed(2)}`)
      console.log(`   Total matches: ${ranking.totalMatches}/${ranking.maxPossibleMatches}`)
      console.log(`   Tests run: ${ranking.testsRun}`)
    })

    // Generate markdown report
    let report = `# SEO Keyword Generation Test Results\n\n`

    // Add SEO rankings section
    report += '## Model Rankings\n\n'
    report += '| Model | Avg Matches | Total Matches | Tests Run |\n'
    report += '|-------|-------------|---------------|-----------|'
    for (const { model, averageScore, totalMatches, testsRun, maxPossibleMatches } of modelRankings) {
      report += `\n| ${model} | ${averageScore.toFixed(2)} | ${totalMatches}/${maxPossibleMatches} | ${testsRun} |`
    }
    report += '\n\n'

    // Add summary section. The guards keep the report free of NaN when this
    // test runs without any preceding results (e.g. when filtered by name).
    const totalTests = testResults.length
    const passedResults = testResults.filter(r => r.passed)
    const failedResults = testResults.filter(r => !r.passed)
    const totalDuration = testResults.reduce((sum, r) => sum + (r.duration || 0), 0)
    const avgDuration = totalTests > 0 ? totalDuration / totalTests : 0
    const successRate = totalTests > 0 ? (passedResults.length / totalTests) * 100 : 0

    report += '## Summary\n\n'
    report += `- Total Tests: ${totalTests}\n`
    report += `- Passed: ${passedResults.length}\n`
    report += `- Failed: ${failedResults.length}\n`
    report += `- Success Rate: ${successRate.toFixed(2)}%\n`
    report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n`

    // Add failed tests section
    report += '## Failed Tests\n\n'
    report += failedResults.length === 0
      ? '*No failed tests*\n\n'
      : failedResults.map(result => renderResult(result, true)).join('')

    // Add passed tests section
    report += '## Passed Tests\n\n'
    report += passedResults.length === 0
      ? '*No passed tests*\n\n'
      : passedResults.map(result => renderResult(result, false)).join('')

    // Write report to file
    write(TEST_REPORT_PATH, report)
    expect(exists(TEST_REPORT_PATH) === 'file').toBe(true)
  })
})
|
||||
Loading…
Reference in New Issue
Block a user