mono/packages/kbot/tests/unit/commons.ts
2025-04-02 21:29:46 +02:00

409 lines
13 KiB
TypeScript

import * as path from 'node:path'
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL, E_OPENROUTER_MODEL, E_Mode } from '../../src/index'
import { run } from '../../src/index'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { sync as mkdirp } from "mkdirp"
/**
 * Groups of models a test suite can request via `getDefaultModels`.
 */
export enum ModelCategory {
  /** Models listed by `getFastModels`. */
  FAST = 'fast',
  /** Models listed by `getLanguageModels`. */
  LANGUAGE = 'language',
  /** Models listed by `getToolModels`. */
  TOOL = 'tool',
  /** The combined list returned by `getDefaultModels` for this category. */
  ALL = 'all',
}
/**
 * Models used for the `ModelCategory.FAST` test category.
 *
 * @returns A freshly allocated array of OpenRouter model identifiers.
 */
export const getFastModels = (): string[] => [
  E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
  E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE,
  E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
]
/**
 * Models used for the `ModelCategory.LANGUAGE` test category.
 *
 * @returns A freshly allocated array of OpenRouter model identifiers.
 */
export const getLanguageModels = (): string[] => [
  E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
  E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B,
  E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
  E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
]
/**
 * Models used for the `ModelCategory.TOOL` test category.
 *
 * @returns A freshly allocated array of OpenRouter model identifiers.
 */
export const getToolModels = (): string[] => [
  E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O,
]
/**
 * Resolves the list of models to exercise for a given category.
 *
 * @param category - Category to resolve; defaults to `ModelCategory.FAST`.
 * @returns The model identifiers for that category. `ModelCategory.ALL`
 *          and any unrecognised value both yield the combined list below.
 */
export const getDefaultModels = (category: ModelCategory = ModelCategory.FAST): string[] => {
  if (category === ModelCategory.FAST) {
    return getFastModels()
  }
  if (category === ModelCategory.LANGUAGE) {
    return getLanguageModels()
  }
  if (category === ModelCategory.TOOL) {
    return getToolModels()
  }
  // ModelCategory.ALL, or anything else, falls through to the combined list.
  return [
    E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
    E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B,
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
    E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1,
    E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE,
  ]
}
/**
 * Tells whether a model identifier is one of the values of `E_OPENROUTER_MODEL`.
 *
 * @param model - Model identifier to look up.
 * @returns `true` when `model` equals one of the enum's values.
 */
export const isOpenRouterModel = (model: string): boolean => {
  const knownModels = Object.values(E_OPENROUTER_MODEL)
  return knownModels.some((known) => known === model)
}
/** Directory two levels above this file's directory; passed to `run` as `path`. */
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
/** `logs` directory under the base path; passed to `run` as `logs`. */
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
/** `preferences.md` under the base path; passed to `run` as `preferences`. */
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
/** Upper bound, in milliseconds, for a single API call (30 seconds). */
export const TEST_TIMEOUT = 30 * 1000

// Report paths configuration
/** Directory next to this file where JSON/markdown reports are written. */
export const REPORTS_DIR = path.resolve(__dirname, './reports')

// Create the reports directory at module load time unless it is already there.
if (exists(REPORTS_DIR) !== 'directory') {
  mkdirp(REPORTS_DIR)
}
/**
 * Builds the absolute path of a report file inside `REPORTS_DIR`.
 *
 * @param category - Report name without extension (e.g. `basic`).
 * @param type - File extension of the report, without the dot.
 * @returns `<REPORTS_DIR>/<category>.<type>`, with `category` resolved
 *          against `REPORTS_DIR` before the extension is appended.
 */
export const getReportPaths = (category: string, type: 'json' | 'md'): string => {
  const reportStem = path.resolve(REPORTS_DIR, category)
  return [reportStem, type].join('.')
}
/**
 * Outcome of a single `runTest` invocation, as stored in the JSON logs.
 */
export interface TestResult {
  /** Name of the test that produced this result. */
  test: string;
  /** Prompt that was sent to the model. */
  prompt: string;
  /** Response strings returned by the run; empty when the run failed or returned nothing. */
  result: string[];
  /** Value the response is compared against. */
  expected: string;
  /** Model reported for the run (`'unknown'` until the run reports one). */
  model: string;
  /** Router recorded for the run. */
  router: string;
  /** Creation time of the result as an ISO-8601 string. */
  timestamp: string;
  /** Whether the response matched `expected`. */
  passed: boolean;
  /** Human-readable explanation, set when the test did not pass. */
  reason?: string;
  /** Normalised error information, set when the run threw (see `formatError`). */
  error?: {
    message: string;
    code?: string;
    type?: string;
    details?: any;
  };
  /** Elapsed wall-clock time of the run in milliseconds. */
  duration?: number;
  /** Report category, derived from the log file name. */
  category?: string;
}
/**
 * Fastest models for one test, as produced by `generateHighscores`.
 */
export interface TestHighscore {
  /** Name of the test the ranking belongs to. */
  test: string;
  /** Ranked entries, shortest duration first. */
  rankings: Array<{
    /** Model identifier. */
    model: string;
    /** Duration in milliseconds. */
    duration: number;
    /** The same duration expressed in seconds. */
    duration_secs: number;
  }>;
}
/**
 * Normalises an arbitrary thrown value into the `TestResult['error']` shape.
 *
 * @param error - Whatever was caught; may be `null`/`undefined` or a non-Error.
 * @returns An object whose `message`, `code` and `type` fall back to
 *          `'Unknown error'`, `'UNKNOWN'` and the constructor name (or
 *          `'Error'`), and whose `details` is the first truthy value among
 *          `error.response.data` and `error.response`, else `error` itself.
 */
export const formatError = (error: any): TestResult['error'] => {
  const message = error?.message || 'Unknown error'
  const code = error?.code || 'UNKNOWN'
  const type = error?.type || error?.constructor?.name || 'Error'
  const response = error?.response
  const details = response?.data || response || error
  return { message, code, type, details }
}
/**
 * Checks whether a model response carries no usable text.
 *
 * @param result - Response strings, or `null`/`undefined` when nothing came back.
 * @returns `true` when `result` is missing, has no entries, or every entry
 *          is falsy or consists only of whitespace.
 */
export const isEmptyResponse = (result: string[] | null | undefined): boolean => {
  if (!result) {
    return true
  }
  // A response counts as non-empty as soon as one entry has visible content.
  const hasContent = result.some((entry) => Boolean(entry) && entry.trim() !== '')
  return !hasContent
}
/**
 * Builds the per-test highscore list from the latest result of every model.
 *
 * @param latestResults - Map of test name to a map of model name to that
 *                        model's result for the test.
 * @returns One entry per test that has at least one result, listing the two
 *          models with the shortest duration (a missing duration counts as 0),
 *          fastest first. Entries follow the iteration order of `latestResults`.
 */
export const generateHighscores = (latestResults: Map<string, Map<string, TestResult>>): TestHighscore[] => {
  const podiumSize = 2
  const durationOf = (entry: TestResult): number => entry.duration || 0

  const perTest = Array.from(latestResults, ([testName, modelResults]) => {
    // Copy the entries before sorting so the caller's map is left untouched.
    const fastest = [...modelResults]
      .sort(([, left], [, right]) => durationOf(left) - durationOf(right))
      .slice(0, podiumSize)

    return {
      test: testName,
      rankings: fastest.map(([model, entry]) => ({
        model,
        duration: durationOf(entry),
        duration_secs: durationOf(entry) / 1000,
      })),
    }
  })

  // Tests without any recorded result do not get a highscore entry.
  return perTest.filter((score) => score.rankings.length > 0)
}
/**
 * Reduces a list of results to the most recent one per test and model.
 * A later entry only replaces an earlier one when its timestamp is strictly newer.
 */
const collectLatestResults = (results: TestResult[]): Map<string, Map<string, TestResult>> => {
  const latest = new Map<string, Map<string, TestResult>>()
  for (const result of results) {
    let perModel = latest.get(result.test)
    if (!perModel) {
      perModel = new Map<string, TestResult>()
      latest.set(result.test, perModel)
    }
    const known = perModel.get(result.model)
    if (!known || new Date(result.timestamp) > new Date(known.timestamp)) {
      perModel.set(result.model, result)
    }
  }
  return latest
}

/**
 * Appends one result to a JSON log file and rewrites its highscores.
 * A missing file is treated as an empty log.
 */
const appendResultToLog = (logFile: string, testResult: TestResult): void => {
  const existing = exists(logFile) === 'file'
    ? JSON.parse(read(logFile) as string)
    : { results: [], highscores: [] }
  const results: TestResult[] = [...(existing.results || []), testResult]
  write(logFile, JSON.stringify({
    results,
    highscores: generateHighscores(collectLatestResults(results)),
    lastUpdated: new Date().toISOString()
  }, null, 2))
}

/**
 * Runs one prompt against one model, compares the first response entry with
 * the expected value and records the outcome in the JSON logs.
 *
 * The comparison is case-insensitive and ignores surrounding whitespace on
 * both the response and `expected`. The call is raced against `TEST_TIMEOUT`.
 * Whatever the outcome, the result is appended to `logPath` and to `all.json`
 * inside `REPORTS_DIR`.
 *
 * @param prompt - Prompt sent to the model.
 * @param expected - Expected answer.
 * @param testName - Name under which the result is recorded.
 * @param modelName - Model identifier passed to `run`.
 * @param logPath - Category log file; its base name becomes the result's `category`.
 * @param mode - Run mode passed to `run`; defaults to `"completion"`.
 * @param options - Extra options spread into the `run` call; they may override
 *                  the defaults above, except `onRun`, which is always replaced.
 * @returns The recorded result when the run completed (passed or not).
 * @throws Rethrows whatever `run` threw, or the timeout error, after the
 *         failure has been logged.
 */
export const runTest = async (
  prompt: string,
  expected: string,
  testName: string,
  modelName: string,
  logPath: string,
  mode: "completion" | "tools" | "assistant" | "custom" = "completion",
  options: any = {}
): Promise<TestResult> => {
  let model = 'unknown'
  let router = 'openrouter'
  const startTime = Date.now()
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined
  let testResult: TestResult | undefined

  // Single place that assembles a result so all three outcomes share one shape.
  const buildResult = (
    result: string[],
    passed: boolean,
    reason?: string,
    error?: TestResult['error']
  ): TestResult => ({
    test: testName,
    prompt,
    result,
    expected,
    model,
    router,
    timestamp: new Date().toISOString(),
    passed,
    duration: Date.now() - startTime,
    ...(error ? { error } : {}),
    reason
  })

  try {
    const pendingRun = run({
      prompt,
      mode,
      model: modelName,
      path: TEST_BASE_PATH,
      logs: TEST_LOGS_PATH,
      preferences: TEST_PREFERENCES_PATH,
      logLevel: 2,
      ...options,
      onRun: async (runOptions) => {
        model = runOptions.model || 'unknown'
        // Record the router that is actually in use, not the model name.
        router = runOptions.router || router
        return runOptions
      }
    })
    const timeout = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
    })
    const result = await Promise.race([pendingRun, timeout]) as string[]

    if (isEmptyResponse(result)) {
      testResult = buildResult([], false, 'Model returned empty response')
    } else {
      const actual = result?.[0]?.trim()?.toLowerCase() || ''
      // Normalise the expectation the same way as the response; otherwise an
      // expected value with capitals or padding could never match.
      const passed = actual === expected.trim().toLowerCase()
      testResult = buildResult(
        result || [],
        passed,
        passed ? undefined : `Expected ${expected}, but got ${actual}`
      )
    }
  } catch (e) {
    const error = formatError(e)
    testResult = buildResult([], false, error?.message || 'Unknown error occurred', error)
    throw e
  } finally {
    // Without this the pending timer would outlive the call and keep the
    // process alive for up to TEST_TIMEOUT.
    if (timeoutHandle !== undefined) {
      clearTimeout(timeoutHandle)
    }
    if (testResult) {
      // Extract category from logPath (e.g., 'reports/basic.json' -> 'basic')
      testResult.category = path.basename(logPath, path.extname(logPath))
      // Update the category-specific log first, then the central all.json log.
      appendResultToLog(logPath, testResult)
      appendResultToLog(path.resolve(REPORTS_DIR, 'all.json'), testResult)
    }
  }
  return testResult
}
/**
 * Renders a markdown report for a set of test results and writes it to disk.
 *
 * The ranking table and the failed/passed sections use only the most recent
 * result per test and model. The summary counts every entry in `testResults`.
 * An empty `testResults` yields a report with a 0.00% success rate and a 0ms
 * average duration.
 *
 * @param testResults - Results to report on; may contain several runs of the
 *                      same test/model pair.
 * @param reportTitle - Text of the top-level heading.
 * @param reportPath - Destination file of the markdown report.
 */
export const generateTestReport = (
  testResults: TestResult[],
  reportTitle: string,
  reportPath: string
): void => {
  // Keep only the latest result for each test+model combination.
  const latestResults = new Map<string, Map<string, TestResult>>()
  for (const result of testResults) {
    let perModel = latestResults.get(result.test)
    if (!perModel) {
      perModel = new Map<string, TestResult>()
      latestResults.set(result.test, perModel)
    }
    const known = perModel.get(result.model)
    if (!known || new Date(result.timestamp) > new Date(known.timestamp)) {
      perModel.set(result.model, result)
    }
  }

  // Renders the detail entries of all latest results with the given outcome.
  // Failure entries additionally carry the error information and the reason.
  const renderEntries = (wantPassed: boolean): string => {
    let section = ''
    for (const [testName, modelResults] of latestResults) {
      for (const [model, result] of modelResults) {
        if (Boolean(result.passed) !== wantPassed) {
          continue
        }
        const duration = result.duration || 0
        section += `### ${testName} - ${model}\n\n`
        section += `- Prompt: \`${result.prompt}\`\n`
        section += `- Expected: \`${result.expected}\`\n`
        section += `- Actual: \`${result.result[0] || ''}\`\n`
        section += `- Duration: ${duration}ms (${(duration / 1000).toFixed(2)}s)\n`
        if (!wantPassed) {
          if (result.error) {
            section += `- Error Type: ${result.error.type}\n`
            section += `- Error Code: ${result.error.code}\n`
            section += `- Error Message: ${result.error.message}\n`
            if (result.error.details?.message) {
              section += `- Error Details: ${result.error.details.message}\n`
            }
          }
          section += `- Reason: ${result.reason}\n`
        }
        section += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
      }
    }
    return section
  }

  let report = `# ${reportTitle}\n\n`

  // Highscore section: every model per test, fastest first.
  report += '## Highscores\n\n'
  report += '### Performance Rankings (Duration)\n\n'
  report += '| Test | Model | Duration (ms) | Duration (s) |\n'
  report += '|------|-------|--------------|--------------|\n'
  for (const [testName, modelResults] of latestResults) {
    const ranked = Array.from(modelResults, ([model, result]) => ({
      model,
      duration: result.duration || 0
    })).sort((a, b) => a.duration - b.duration)
    for (const { model, duration } of ranked) {
      report += `| ${testName} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
    }
  }
  report += '\n'

  // Summary section. Guard the divisions so an empty run reports zeros
  // instead of NaN.
  const totalTests = testResults.length
  const passedTests = testResults.filter(r => r.passed).length
  const failedTests = totalTests - passedTests
  const totalDuration = testResults.reduce((sum, r) => sum + (r.duration || 0), 0)
  const avgDuration = totalTests > 0 ? totalDuration / totalTests : 0
  const successRate = totalTests > 0 ? (passedTests / totalTests) * 100 : 0
  report += '## Summary\n\n'
  report += `- Total Tests: ${totalTests}\n`
  report += `- Passed: ${passedTests}\n`
  report += `- Failed: ${failedTests}\n`
  report += `- Success Rate: ${successRate.toFixed(2)}%\n`
  report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n`

  // Failed tests come first, then the passed ones.
  report += '## Failed Tests\n\n'
  report += renderEntries(false) || '*No failed tests*\n\n'
  report += '## Passed Tests\n\n'
  report += renderEntries(true) || '*No passed tests*\n\n'

  // Write report to file
  write(reportPath, report)
}