import * as path from 'node:path'
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL, E_OPENROUTER_MODEL, E_Mode } from '../../src/index'
import { run } from '../../src/index'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { sync as mkdirp } from "mkdirp"

/** Named groups of models selectable through {@link getDefaultModels}. */
export enum ModelCategory {
  FAST = 'fast',
  LANGUAGE = 'language',
  TOOL = 'tool',
  ALL = 'all'
}

/** Returns the model identifiers listed under the FAST category. */
export const getFastModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
    E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE,
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
    E_OPENROUTER_MODEL.MODEL_OPENROUTER_QUASAR_ALPHA
  ]
}

/** Returns the model identifiers listed under the LANGUAGE category. */
export const getLanguageModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
    E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B,
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO
  ]
}

/** Returns the model identifiers listed under the TOOL category. */
export const getToolModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
  ]
}

/**
 * Returns the model list for a category.
 *
 * @param category - Category to look up; defaults to `ModelCategory.FAST`.
 * @returns The category's model identifiers. `ModelCategory.ALL` and any
 *   unrecognised value share the same combined list.
 */
export const getDefaultModels = (category: ModelCategory = ModelCategory.FAST): string[] => {
  switch (category) {
    case ModelCategory.FAST:
      return getFastModels()
    case ModelCategory.LANGUAGE:
      return getLanguageModels()
    case ModelCategory.TOOL:
      return getToolModels()
    case ModelCategory.ALL:
    default:
      return [
        E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
        E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B,
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
        E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1,
        E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE
      ]
  }
}

/** Returns true when `model` is one of the values of `E_OPENROUTER_MODEL`. */
export const isOpenRouterModel = (model: string): boolean => {
  return Object.values(E_OPENROUTER_MODEL).includes(model as E_OPENROUTER_MODEL)
}

// Paths handed to `run` by `runTest`, resolved relative to this file's directory.
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')

/** 30 seconds timeout for API calls (milliseconds). */
export const TEST_TIMEOUT = 30000

// Report paths configuration
export const REPORTS_DIR = path.resolve(__dirname, './reports')

// Ensure reports directory exists
if (exists(REPORTS_DIR) !== 'directory') {
  mkdirp(REPORTS_DIR)
}

/**
 * Builds the path of a report file inside `REPORTS_DIR`.
 *
 * @param category - Report base name.
 * @param type - File extension, without the dot.
 * @returns `<REPORTS_DIR>/<category>.<type>`.
 */
export const getReportPaths = (category: string, type: 'json' | 'md'): string => {
  const base = path.resolve(REPORTS_DIR, category)
  return `${base}.${type}`
}

/** One recorded test run, as stored in the JSON logs and rendered in reports. */
export interface TestResult {
  test: string;
  prompt: string;
  result: string[];
  expected: string;
  model: string;
  router: string;
  timestamp: string;
  passed: boolean;
  reason?: string;
  error?: {
    message: string;
    code?: string;
    type?: string;
    details?: any;
  };
  duration?: number
  category?: string;
}

/** Duration ranking of models for a single test. */
export interface TestHighscore {
  test: string;
  rankings: {
    model: string;
    duration: number;
    duration_secs: number;
  }[];
}

/** Latest result per model, grouped by test name: test -> model -> result. */
type LatestResults = Map<string, Map<string, TestResult>>

/**
 * Converts a thrown value into the `error` shape stored on a `TestResult`.
 *
 * @param error - The caught value; missing fields are replaced by defaults.
 * @returns Message, code, type and details extracted from `error`.
 */
export const formatError = (error: any): TestResult['error'] => {
  return {
    message: error?.message || 'Unknown error',
    code: error?.code || 'UNKNOWN',
    type: error?.type || error?.constructor?.name || 'Error',
    details: error?.response?.data || error?.response || error
  }
}

/**
 * Returns true when `result` is missing, has no entries, or every entry is
 * empty or whitespace-only.
 */
export const isEmptyResponse = (result: string[] | null | undefined): boolean => {
  return !result || result.length === 0 || result.every(r => !r || r.trim() === '')
}

/**
 * Keeps, for every test + model pair, only the result with the newest timestamp.
 *
 * @param results - Results in any order; may contain several runs per pair.
 * @returns Results grouped by test name, then by model.
 */
const groupLatestResults = (results: TestResult[]): LatestResults => {
  const grouped: LatestResults = new Map()
  for (const entry of results) {
    let perModel = grouped.get(entry.test)
    if (!perModel) {
      perModel = new Map()
      grouped.set(entry.test, perModel)
    }
    const previous = perModel.get(entry.model)
    if (!previous || new Date(entry.timestamp) > new Date(previous.timestamp)) {
      perModel.set(entry.model, entry)
    }
  }
  return grouped
}

/**
 * Ranks the models of each test by duration, shortest first.
 *
 * @param latestResults - Results grouped by test name, then by model.
 * @returns One entry per test that has at least one result, holding its two
 *   shortest runs. A missing duration counts as 0.
 */
export const generateHighscores = (latestResults: Map<string, Map<string, TestResult>>): TestHighscore[] => {
  const highscores: TestHighscore[] = []

  for (const [testName, modelResults] of latestResults) {
    // Convert model results to array and sort by duration
    const sortedResults = Array.from(modelResults.entries())
      .map(([model, result]) => ({ model, result }))
      .sort((a, b) => (a.result.duration ?? 0) - (b.result.duration ?? 0))
      .slice(0, 2) // Get top 2

    if (sortedResults.length > 0) {
      highscores.push({
        test: testName,
        rankings: sortedResults.map(({ model, result }) => ({
          model,
          duration: result.duration ?? 0,
          duration_secs: (result.duration ?? 0) / 1000
        }))
      })
    }
  }

  return highscores
}

/**
 * Appends `entry` to the JSON log at `logFile` and rewrites the file with
 * regenerated highscores and a fresh `lastUpdated` stamp.
 *
 * @param logFile - Path of the JSON log; it is created when it is not a file yet.
 * @param entry - Result to append.
 */
const appendResultToLog = (logFile: string, entry: TestResult): void => {
  const existingData = exists(logFile) === 'file'
    ? JSON.parse(read(logFile) as string)
    : { results: [], highscores: [] }
  const results: TestResult[] = [...(existingData.results || []), entry]

  write(logFile, JSON.stringify({
    results,
    highscores: generateHighscores(groupLatestResults(results)),
    lastUpdated: new Date().toISOString()
  }, null, 2))
}

/**
 * Runs one prompt against one model, compares the first response with
 * `expected`, and appends the outcome to the category log and to `all.json`.
 *
 * @param prompt - Prompt passed to `run`.
 * @param expected - Expected answer; compared case-insensitively and ignoring
 *   surrounding whitespace against the first response entry.
 * @param testName - Name recorded in the result's `test` field.
 * @param modelName - Model identifier passed to `run`.
 * @param logPath - Category log file; its base name (without extension) is
 *   stored as the result's `category`.
 * @param mode - Mode passed to `run`; defaults to `"completion"`.
 * @param options - Extra options spread into the `run` call.
 * @returns The recorded result when the call completes (passed or not).
 * @throws Rethrows whatever `run` throws, or an `API call timed out` error
 *   after `TEST_TIMEOUT` ms. The failure is still written to the logs first.
 */
export const runTest = async (
  prompt: string,
  expected: string,
  testName: string,
  modelName: string,
  logPath: string,
  mode: "completion" | "tools" | "assistant" | "custom" = "completion",
  options: any = {}
): Promise<TestResult> => {
  let model = 'unknown'
  let router = 'openrouter'
  const startTime = Date.now()
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined
  let testResult: TestResult | undefined

  try {
    const result = await Promise.race([
      run({
        prompt,
        mode,
        model: modelName,
        path: TEST_BASE_PATH,
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        logLevel: 2,
        ...options,
        onRun: async (runOptions) => {
          model = runOptions.model || 'unknown'
          // NOTE(review): `router` is filled from the model name; this looks
          // like it was meant to read a router field — confirm against the
          // options type accepted by `run`.
          router = runOptions.model as string
          return runOptions
        }
      }),
      new Promise<never>((_, reject) => {
        timeoutHandle = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
      })
    ]) as string[]

    if (isEmptyResponse(result)) {
      testResult = {
        test: testName,
        prompt,
        result: [],
        expected,
        model,
        router,
        timestamp: new Date().toISOString(),
        passed: false,
        duration: Date.now() - startTime,
        reason: 'Model returned empty response'
      }
    } else {
      const actual = result?.[0]?.trim()?.toLowerCase() || ''
      // `actual` is normalised, so normalise `expected` the same way;
      // otherwise an expectation with uppercase or padding can never match.
      const passed = actual === expected.trim().toLowerCase()
      testResult = {
        test: testName,
        prompt,
        result: result || [],
        expected,
        model,
        router,
        timestamp: new Date().toISOString(),
        passed,
        duration: Date.now() - startTime,
        reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
      }
    }
    return testResult
  } catch (e) {
    const error = formatError(e)
    testResult = {
      test: testName,
      prompt,
      result: [],
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed: false,
      duration: Date.now() - startTime,
      error,
      reason: error?.message || 'Unknown error occurred'
    }
    throw e
  } finally {
    // Cancel the race timer so it does not stay pending after a fast call.
    clearTimeout(timeoutHandle)

    if (testResult) {
      // Extract category from logPath (e.g., 'reports/basic.json' -> 'basic')
      testResult.category = path.basename(logPath, path.extname(logPath))

      // Update the category-specific log, then the central all.json log
      appendResultToLog(logPath, testResult)
      appendResultToLog(path.resolve(REPORTS_DIR, 'all.json'), testResult)
    }
  }
}

/**
 * Renders a Markdown report and writes it to `reportPath`.
 *
 * The rankings and the per-test detail sections use only the newest result of
 * each test + model pair; the summary counts every entry of `testResults`.
 *
 * @param testResults - Results to report on; may be empty.
 * @param reportTitle - Text of the top-level heading.
 * @param reportPath - Destination file.
 */
export const generateTestReport = (
  testResults: TestResult[],
  reportTitle: string,
  reportPath: string
): void => {
  // Get only the latest result for each test+model combination
  const latestResults = groupLatestResults(testResults)

  // Renders the detail entry of one result; error and reason lines are
  // emitted for failed results only.
  const renderDetails = (testName: string, model: string, result: TestResult): string => {
    const duration = result.duration ?? 0
    let section = `### ${testName} - ${model}\n\n`
    section += `- Prompt: \`${result.prompt}\`\n`
    section += `- Expected: \`${result.expected}\`\n`
    section += `- Actual: \`${result.result[0] || ''}\`\n`
    section += `- Duration: ${duration}ms (${(duration / 1000).toFixed(2)}s)\n`
    if (!result.passed) {
      if (result.error) {
        section += `- Error Type: ${result.error.type}\n`
        section += `- Error Code: ${result.error.code}\n`
        section += `- Error Message: ${result.error.message}\n`
        if (result.error.details?.message) {
          section += `- Error Details: ${result.error.details.message}\n`
        }
      }
      section += `- Reason: ${result.reason}\n`
    }
    section += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
    return section
  }

  // Renders every latest result whose `passed` flag equals `wantPassed`.
  const renderSection = (wantPassed: boolean): string => {
    let section = ''
    for (const [testName, modelResults] of latestResults) {
      for (const [model, result] of modelResults) {
        if (result.passed === wantPassed) {
          section += renderDetails(testName, model, result)
        }
      }
    }
    return section
  }

  // Generate markdown report
  let report = `# ${reportTitle}\n\n`

  // Add highscore section
  report += '## Highscores\n\n'

  // Add regular test rankings
  report += '### Performance Rankings (Duration)\n\n'
  report += '| Test | Model | Duration (ms) | Duration (s) |\n'
  report += '|------|-------|--------------|--------------|\n'

  for (const [testName, modelResults] of latestResults) {
    const ranked = Array.from(modelResults.entries())
      .map(([model, result]) => ({ model, duration: result.duration ?? 0 }))
      .sort((a, b) => a.duration - b.duration)

    for (const { model, duration } of ranked) {
      report += `| ${testName} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
    }
  }
  report += '\n'

  // Add summary section
  report += '## Summary\n\n'
  const totalTests = testResults.length
  const passedTests = testResults.filter(r => r.passed).length
  const failedTests = totalTests - passedTests
  // Guard the divisions so an empty run reports zeros instead of NaN.
  const successRate = totalTests > 0 ? (passedTests / totalTests) * 100 : 0
  const avgDuration = totalTests > 0
    ? testResults.reduce((sum, r) => sum + (r.duration ?? 0), 0) / totalTests
    : 0

  report += `- Total Tests: ${totalTests}\n`
  report += `- Passed: ${passedTests}\n`
  report += `- Failed: ${failedTests}\n`
  report += `- Success Rate: ${successRate.toFixed(2)}%\n`
  report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n`

  // First list failed tests
  report += '## Failed Tests\n\n'
  report += renderSection(false) || '*No failed tests*\n\n'

  // Then list passed tests
  report += '## Passed Tests\n\n'
  report += renderSection(true) || '*No passed tests*\n\n'

  // Write report to file
  write(reportPath, report)
}