import * as path from 'node:path'

import { E_OPENROUTER_MODEL, run } from '../../src/index'
import { sync as write } from '@polymech/fs/write'
import { sync as read } from '@polymech/fs/read'
import { sync as exists } from '@polymech/fs/exists'
import { sync as mkdirp } from 'mkdirp'

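/**
 * Categories used to pick a default model pool for a test run.
 * Note that `FILES` maps to the singular string 'file'; report
 * filenames and category strings rely on these exact values.
 */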
export enum ModelCategory {
    FAST = 'fast',
    LANGUAGE = 'language',
    TOOL = 'tool',
    ALL = 'all',
    CODING = 'coding',
    FILES = 'file'
}

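// Curated per-category model pools. These lists are snapshots and are
// expected to change as models are added to or removed from OpenRouter.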
export const getFastModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
        E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE,
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
        E_OPENROUTER_MODEL.MODEL_OPENROUTER_QUASAR_ALPHA
    ]
}

export const getCodingModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_OPENROUTER_QUASAR_ALPHA
    ]
}

export const getFileModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
        E_OPENROUTER_MODEL.MODEL_OPENROUTER_QUASAR_ALPHA,
        E_OPENROUTER_MODEL.MODEL_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE
    ]
}

export const getLanguageModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
        E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B,
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO
    ]
}

export const getToolModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
    ]
}

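/**
 * Resolves the model pool for a given test category. `ALL` (and any
 * unknown category) falls back to a broad mixed pool.
 *
 * Example (illustrative):
 *   const models = getDefaultModels(ModelCategory.CODING)
 *   // -> [E_OPENROUTER_MODEL.MODEL_OPENROUTER_QUASAR_ALPHA]
 */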
export const getDefaultModels = (category: ModelCategory = ModelCategory.FAST): string[] => {
    switch (category) {
        case ModelCategory.FAST:
            return getFastModels()
        case ModelCategory.LANGUAGE:
            return getLanguageModels()
        case ModelCategory.TOOL:
            return getToolModels()
        case ModelCategory.CODING:
            return getCodingModels()
        case ModelCategory.FILES:
            return getFileModels()
        case ModelCategory.ALL:
        default:
            return [
                E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_5_SONNET,
                E_OPENROUTER_MODEL.MODEL_QWEN_QWQ_32B,
                E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
                E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_3_5_TURBO,
                E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1,
                E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_DISTILL_QWEN_14B_FREE
            ]
    }
}

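/**
 * Checks whether a model id is one of the known OpenRouter models.
 *
 * Example (illustrative):
 *   isOpenRouterModel(E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O) // true
 *   isOpenRouterModel('not-a-model')                          // false
 */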
export const isOpenRouterModel = (model: string): boolean => {
    return Object.values(E_OPENROUTER_MODEL).includes(model as E_OPENROUTER_MODEL)
}

export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
export const TEST_TIMEOUT = 30000 // 30-second timeout for API calls

// Report paths configuration
export const REPORTS_DIR = path.resolve(__dirname, './reports')

// Ensure the reports directory exists before any test writes to it
if (exists(REPORTS_DIR) !== 'directory') {
    mkdirp(REPORTS_DIR)
}

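/**
 * Builds the report file path for a category, e.g.
 * getReportPaths('basic', 'json') -> '<REPORTS_DIR>/basic.json'.
 */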
export const getReportPaths = (category: string, type: 'json' | 'md'): string => {
    const base = path.resolve(REPORTS_DIR, category)
    return `${base}.${type}`
}

export interface TestResult {
    test: string;
    prompt: string;
    result: string[];
    expected: string;
    model: string;
    router: string;
    timestamp: string;
    passed: boolean;
    reason?: string;
    error?: {
        message: string;
        code?: string;
        type?: string;
        details?: any;
    };
    duration?: number;
    category?: string;
}

export interface TestHighscore {
    test: string;
    rankings: {
        model: string;
        duration: number;
        duration_secs: number;
    }[];
}

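/**
 * Normalizes an arbitrary thrown value into the TestResult error shape.
 * `details` prefers `error.response.data`, so HTTP client errors (an
 * Axios-like response shape is assumed here) keep their payload.
 */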
export const formatError = (error: any): TestResult['error'] => {
    return {
        message: error?.message || 'Unknown error',
        code: error?.code || 'UNKNOWN',
        type: error?.type || error?.constructor?.name || 'Error',
        details: error?.response?.data || error?.response || error
    }
}

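/**
 * Treats null/undefined, empty arrays, and arrays of blank strings as empty.
 *
 * Example (illustrative):
 *   isEmptyResponse(undefined)   // true
 *   isEmptyResponse(['  ', ''])  // true
 *   isEmptyResponse(['answer'])  // false
 */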
export const isEmptyResponse = (result: string[] | null | undefined): boolean => {
    return !result || result.length === 0 || result.every(r => !r || r.trim() === '')
}

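/**
 * Ranks models per test by duration (ascending) and keeps the two fastest.
 * Results without a recorded duration are treated as 0 ms, which sorts
 * them first; callers should be aware of that edge case.
 */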
export const generateHighscores = (latestResults: Map<string, Map<string, TestResult>>): TestHighscore[] => {
    const highscores: TestHighscore[] = []

    for (const [testName, modelResults] of latestResults) {
        // Convert model results to an array, sort by duration, keep the top 2
        const sortedResults = Array.from(modelResults.entries())
            .map(([model, result]) => ({ model, result }))
            .sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0))
            .slice(0, 2)

        if (sortedResults.length > 0) {
            highscores.push({
                test: testName,
                rankings: sortedResults.map(({ model, result }) => ({
                    model,
                    duration: result.duration || 0,
                    duration_secs: (result.duration || 0) / 1000
                }))
            })
        }
    }

    return highscores
}

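/**
 * Runs a single prompt against a model, races it against TEST_TIMEOUT, and
 * persists the result (plus recomputed highscores) to the category log and
 * the central all.json. A pass means the first line of output, trimmed and
 * lower-cased, equals `expected` exactly. Errors are logged and re-thrown.
 *
 * Example (illustrative; the test name and expectation are made up):
 *   const result = await runTest(
 *       'Reply with exactly: ok',
 *       'ok',
 *       'sanity-check',
 *       E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
 *       getReportPaths('basic', 'json')
 *   )
 */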
export const runTest = async (
    prompt: string,
    expected: string,
    testName: string,
    modelName: string,
    logPath: string,
    mode: 'completion' | 'tools' | 'assistant' | 'custom' = 'completion',
    options: any = {}
): Promise<TestResult> => {
    let model = 'unknown'
    let router = 'openrouter'
    const startTime = Date.now()
    let error: TestResult['error'] | undefined
    let testResult: TestResult | undefined
    const defaultOptions = {
        filters: 'code'
    }
    try {
        const result = await Promise.race([
            run({
                prompt,
                mode,
                model: modelName,
                path: TEST_BASE_PATH,
                logs: TEST_LOGS_PATH,
                preferences: TEST_PREFERENCES_PATH,
                logLevel: 2,
                ...defaultOptions,
                ...options,
                onRun: async (resolvedOptions) => {
                    // Record the model the run actually resolved to
                    model = resolvedOptions.model || 'unknown'
                    router = resolvedOptions.model as string
                    return resolvedOptions
                }
            }),
            new Promise((_, reject) =>
                setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
            )
        ]) as string[]

        if (isEmptyResponse(result)) {
            testResult = {
                test: testName,
                prompt,
                result: [],
                expected,
                model,
                router,
                timestamp: new Date().toISOString(),
                passed: false,
                duration: Date.now() - startTime,
                reason: 'Model returned empty response'
            }
        } else {
            const actual = result?.[0]?.trim()?.toLowerCase() || ''
            const passed = actual === expected

            testResult = {
                test: testName,
                prompt,
                result: result || [],
                expected,
                model,
                router,
                timestamp: new Date().toISOString(),
                passed,
                duration: Date.now() - startTime,
                reason: passed ? undefined : `Expected ${expected}, but got ${actual}`
            }
        }
    } catch (e) {
        error = formatError(e)
        testResult = {
            test: testName,
            prompt,
            result: [],
            expected,
            model,
            router,
            timestamp: new Date().toISOString(),
            passed: false,
            duration: Date.now() - startTime,
            error,
            reason: error?.message || 'Unknown error occurred'
        }
        throw e
    } finally {
        if (testResult) {
            const finalResult = testResult

            // Extract category from logPath (e.g., 'reports/basic.json' -> 'basic')
            finalResult.category = path.basename(logPath, path.extname(logPath))

            // Group results by test and model, keeping only the latest run per pair
            const groupLatest = (results: TestResult[]): Map<string, Map<string, TestResult>> => {
                const latest = new Map<string, Map<string, TestResult>>()
                results.forEach(result => {
                    if (!latest.has(result.test)) {
                        latest.set(result.test, new Map())
                    }
                    const testMap = latest.get(result.test)!
                    const existingResult = testMap.get(result.model)
                    if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
                        testMap.set(result.model, result)
                    }
                })
                return latest
            }

            // Append the new result to a log file and recompute its highscores
            const updateLog = (targetPath: string) => {
                const existingData = exists(targetPath) === 'file'
                    ? JSON.parse(read(targetPath) as string)
                    : { results: [], highscores: [] }
                const updatedResults = [...(existingData.results || []), finalResult]
                write(targetPath, JSON.stringify({
                    results: updatedResults,
                    highscores: generateHighscores(groupLatest(updatedResults)),
                    lastUpdated: new Date().toISOString()
                }, null, 2))
            }

            // Update the category-specific log, then the central all.json log
            updateLog(logPath)
            updateLog(path.resolve(REPORTS_DIR, 'all.json'))
        }
    }
    return testResult
}

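/**
 * Renders a markdown report (highscores, summary, failed and passed tests)
 * from a list of results and writes it to reportPath. Only the latest
 * result per test+model pair is reported.
 *
 * Example (illustrative; assumes results were produced by runTest):
 *   generateTestReport(results, 'Basic Tests', getReportPaths('basic', 'md'))
 */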
export const generateTestReport = (
    testResults: TestResult[],
    reportTitle: string,
    reportPath: string
): void => {
    // Group results by test and model, keeping only the latest result
    // for each test+model combination
    const latestResults = new Map<string, Map<string, TestResult>>()
    testResults.forEach(result => {
        if (!latestResults.has(result.test)) {
            latestResults.set(result.test, new Map())
        }
        const testMap = latestResults.get(result.test)!
        const existingResult = testMap.get(result.model)
        if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
            testMap.set(result.model, result)
        }
    })

    // Generate markdown report
    let report = `# ${reportTitle}\n\n`

    // Add highscore section
    report += '## Highscores\n\n'

    // Add regular test rankings
    report += '### Performance Rankings (Duration)\n\n'
    report += '| Test | Model | Duration (ms) | Duration (s) |\n'
    report += '|------|-------|--------------|--------------|\n'

    Array.from(latestResults.entries()).forEach(([testName, modelResults]) => {
        const sortedResults = Array.from(modelResults.entries())
            .map(([model, result]) => ({
                model,
                duration: result.duration || 0
            }))
            .sort((a, b) => a.duration - b.duration)

        sortedResults.forEach(({ model, duration }) => {
            report += `| ${testName} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
        })
    })
    report += '\n'

    // Add summary section (guard against division by zero when there are no results)
    report += '## Summary\n\n'
    const totalTests = testResults.length
    const passedTests = testResults.filter(r => r.passed).length
    const failedTests = totalTests - passedTests
    const avgDuration = totalTests > 0
        ? testResults.reduce((sum, r) => sum + (r.duration || 0), 0) / totalTests
        : 0
    const successRate = totalTests > 0 ? (passedTests / totalTests) * 100 : 0
    report += `- Total Tests: ${totalTests}\n`
    report += `- Passed: ${passedTests}\n`
    report += `- Failed: ${failedTests}\n`
    report += `- Success Rate: ${successRate.toFixed(2)}%\n`
    report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n`

    // First list failed tests
    report += '## Failed Tests\n\n'
    let hasFailures = false
    for (const [testName, modelResults] of latestResults) {
        for (const [model, result] of modelResults) {
            if (!result.passed) {
                hasFailures = true
                report += `### ${testName} - ${model}\n\n`
                report += `- Prompt: \`${result.prompt}\`\n`
                report += `- Expected: \`${result.expected}\`\n`
                report += `- Actual: \`${result.result[0] || ''}\`\n`
                report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
                if (result.error) {
                    report += `- Error Type: ${result.error.type}\n`
                    report += `- Error Code: ${result.error.code}\n`
                    report += `- Error Message: ${result.error.message}\n`
                    if (result.error.details?.message) {
                        report += `- Error Details: ${result.error.details.message}\n`
                    }
                }
                report += `- Reason: ${result.reason}\n`
                report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
            }
        }
    }

    if (!hasFailures) {
        report += '*No failed tests*\n\n'
    }

    // Then list passed tests
    report += '## Passed Tests\n\n'
    let hasPassed = false
    for (const [testName, modelResults] of latestResults) {
        for (const [model, result] of modelResults) {
            if (result.passed) {
                hasPassed = true
                report += `### ${testName} - ${model}\n\n`
                report += `- Prompt: \`${result.prompt}\`\n`
                report += `- Expected: \`${result.expected}\`\n`
                report += `- Actual: \`${result.result[0] || ''}\`\n`
                report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
                report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
            }
        }
    }

    if (!hasPassed) {
        report += '*No passed tests*\n\n'
    }

    // Write report to file
    write(reportPath, report)
}