import * as path from 'node:path'
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL, E_OPENROUTER_MODEL, E_Mode } from '../../src/index'
import { run } from '../../src/index'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { sync as mkdirp } from "mkdirp"
import { zodResponseFormat } from "openai/helpers/zod"
import { JSONSchemaToZod } from '@dmitryrechkin/json-schema-to-zod';

export enum ModelCategory {
    FAST = 'fast',
    LANGUAGE = 'language',
    TOOL = 'tool',
    ALL = 'all',
    CODING = 'coding',
    FILES = 'file',
    TEST_EQUAL = ''
}

export enum EqualityCheck {
    DEFAULT = 'default',
    JSON_EQUAL = 'json_equal',
    LLM_EQUAL = 'llm_equal',
    NONE = 'none'
}

export type EqualityFn = (actual: string, expected: string, prompt?: string) => Promise<boolean>;

export const EQUALITY_CHECKS: Record<string, EqualityFn> = {
    [EqualityCheck.DEFAULT]: async (actual: string, expected: string): Promise<boolean> => {
        return (actual || '').trim().toLowerCase() === (expected || '').trim().toLowerCase();
    },
    [EqualityCheck.JSON_EQUAL]: async (actual: string, expected: string): Promise<boolean> => {
        try {
            // Parse and re-stringify both sides to normalize before comparing
            const actualJson = JSON.parse(actual.trim());
            const expectedJson = JSON.parse(expected.trim());
            return JSON.stringify(actualJson) === JSON.stringify(expectedJson);
        } catch (e) {
            return false;
        }
    },
    [EqualityCheck.LLM_EQUAL]: async (actual: string, expected: string, prompt?: string): Promise<boolean> => {
        try {
            const model = getTestEqualModels()[0];
            if (!model) {
                return false;
            }
            const comparisonPrompt = `You are an assistant that judges if two AI responses are semantically equivalent for a given prompt.
The original prompt was: "${prompt}"

Response A:
"${actual}"
Response B (expected):
"${expected}"
Are these two responses semantically equivalent? Consider that minor differences in formatting (like commas, casing) or phrasing should be ignored as long as the meaning is the same. Don't comment, just return the JSON object.`;
            const responseSchema = {
                type: 'object',
                properties: {
                    equivalent: {
                        type: 'boolean',
                        description: "Whether the two responses are semantically equivalent."
                    },
                    semantic_distance: {
                        type: 'number',
                        description: "The semantic distance between the two responses, on a scale of 0 to 100."
                    }
                },
                required: ['equivalent', 'semantic_distance']
            };

            const zodSchema = JSONSchemaToZod.convert(responseSchema);
            const format = zodResponseFormat(zodSchema, 'equivalent');

            const result = await run({
                prompt: comparisonPrompt,
                model,
                mode: 'completion',
                format
            });

            if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) {
                return false;
            }

            const parsedResult = JSON.parse(result[0] as string);
            return parsedResult.equivalent === true;
        } catch (e) {
            return false;
        }
    },
    [EqualityCheck.NONE]: async (): Promise<boolean> => {
        return true;
    },
};

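// Illustrative sketch (not part of the original helpers): resolving a comparator from
// EQUALITY_CHECKS by its EqualityCheck key and falling back to the default check for
// unknown keys. The helper name `exampleCompare` is hypothetical.
export const exampleCompare = async (
    check: EqualityCheck,
    actual: string,
    expected: string,
    prompt?: string
): Promise<boolean> => {
    const checkFn = EQUALITY_CHECKS[check] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]
    return checkFn(actual, expected, prompt)
}
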
export const getFastModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_SONNET_4,
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
        E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE
    ]
}

export const getTestEqualModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
    ]
}

export const getCodingModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_NVIDIA_LLAMA_3_3_NEMOTRON_SUPER_49B_V1_FREE
    ]
}

export const getFileModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
    ]
}

export const getLanguageModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_SONNET_4,
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
        E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE
    ]
}

export const getResearchModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_PERPLEXITY_SONAR_DEEP_RESEARCH
    ]
}

export const getToolModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
    ]
}

export const getDefaultModels = (category: ModelCategory = ModelCategory.FAST): string[] => {
    switch (category) {
        case ModelCategory.FAST:
            return getFastModels()
        case ModelCategory.LANGUAGE:
            return getLanguageModels()
        case ModelCategory.TOOL:
            return getToolModels()
        case ModelCategory.CODING:
            return getCodingModels()
        case ModelCategory.FILES:
            return getFileModels()
        case ModelCategory.TEST_EQUAL:
            return getTestEqualModels()
        case ModelCategory.ALL:
        default:
            return [
                E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE,
                E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_7_SONNET,
                E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI
            ]
    }
}

export const isOpenRouterModel = (model: string): boolean => {
    return Object.values(E_OPENROUTER_MODEL).includes(model as E_OPENROUTER_MODEL)
}

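// Illustrative sketch: combining the helpers above to keep only the models in a
// category that are routed through OpenRouter. The function name is hypothetical.
export const exampleOpenRouterModelsFor = (category: ModelCategory = ModelCategory.ALL): string[] => {
    return getDefaultModels(category).filter(isOpenRouterModel)
}
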
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
export const TEST_TIMEOUT = 60000 // 60 seconds timeout for API calls

// Report paths configuration
export const REPORTS_DIR = path.resolve(__dirname, './reports')

// Ensure reports directory exists
if (exists(REPORTS_DIR) !== 'directory') {
    mkdirp(REPORTS_DIR)
}

export const getReportPaths = (category: string, type: 'json' | 'md'): string => {
    const base = path.resolve(REPORTS_DIR, category)
    return `${base}.${type}`
}

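// For example (illustrative, the category name 'basic' is hypothetical):
// getReportPaths('basic', 'json') resolves to `${REPORTS_DIR}/basic.json`,
// and getReportPaths('basic', 'md') to `${REPORTS_DIR}/basic.md`.
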
export interface TestResult {
    test: string;
    prompt: string;
    result: string[];
    expected: string;
    model: string;
    router: string;
    timestamp: string;
    passed: boolean;
    reason?: string;
    error?: {
        message: string;
        code?: string;
        type?: string;
        details?: any;
    };
    duration?: number;
    category?: string;
}

export interface TestHighscore {
    test: string;
    rankings: {
        model: string;
        duration: number;
        duration_secs: number;
    }[];
}

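// Illustrative example of the shape a single TestResult takes (all values hypothetical):
// {
//   test: 'basic-addition',
//   prompt: 'What is 2 + 2?',
//   result: ['4'],
//   expected: '4',
//   model: 'openai/gpt-4o-mini',
//   router: 'openrouter',
//   timestamp: '2025-01-01T00:00:00.000Z',
//   passed: true,
//   duration: 1234,
//   category: 'basic'
// }
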
export const formatError = (error: any): TestResult['error'] => {
    return {
        message: error?.message || 'Unknown error',
        code: error?.code || 'UNKNOWN',
        type: error?.type || error?.constructor?.name || 'Error',
        details: error?.response?.data || error?.response || error
    }
}

export const isEmptyResponse = (result: string[] | null | undefined): boolean => {
    return !result || result.length === 0 || result.every(r => !r || r.trim() === '')
}

export const generateHighscores = (latestResults: Map<string, Map<string, TestResult>>): TestHighscore[] => {
    const highscores: TestHighscore[] = []

    for (const [testName, modelResults] of latestResults) {
        // Convert model results to array and sort by duration
        const sortedResults = Array.from(modelResults.entries())
            .map(([model, result]) => ({ model, result }))
            .sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0))
            .slice(0, 2) // Get top 2

        if (sortedResults.length > 0) {
            highscores.push({
                test: testName,
                rankings: sortedResults.map(({ model, result }) => ({
                    model,
                    duration: result.duration || 0,
                    duration_secs: (result.duration || 0) / 1000
                }))
            })
        }
    }

    return highscores
}

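// Illustrative sketch (hypothetical helper, not part of the original module): grouping a
// flat list of results into the Map<test, Map<model, latest TestResult>> shape that
// generateHighscores expects, mirroring the grouping done inline in runTest below.
export const exampleGroupLatestResults = (results: TestResult[]): Map<string, Map<string, TestResult>> => {
    const grouped = new Map<string, Map<string, TestResult>>()
    results.forEach(result => {
        if (!grouped.has(result.test)) {
            grouped.set(result.test, new Map())
        }
        const testMap = grouped.get(result.test)!
        const existing = testMap.get(result.model)
        // Keep only the most recent result per test+model combination
        if (!existing || new Date(result.timestamp) > new Date(existing.timestamp)) {
            testMap.set(result.model, result)
        }
    })
    return grouped
}
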
export const runTest = async (
    prompt: string,
    expected: string,
    testName: string,
    modelName: string,
    logPath: string,
    mode: "completion" | "tools" | "assistant" | "custom" = "completion",
    options: any = {}
): Promise<TestResult> => {
    let model = 'unknown'
    let router = 'openrouter'
    const startTime = Date.now()
    let error: TestResult['error'] | undefined
    let testResult: TestResult | undefined
    const defaultOptions = {
        filters: 'code'
    }
    let format: any = null
    if (options.format) {
        const zodSchema = JSONSchemaToZod.convert(options.format);
        format = zodResponseFormat(zodSchema, "format");
    }
    try {
        const result = await Promise.race([
            run({
                prompt,
                mode,
                model: modelName,
                path: TEST_BASE_PATH,
                logs: TEST_LOGS_PATH,
                preferences: TEST_PREFERENCES_PATH,
                logLevel: 2,
                ...{ ...defaultOptions, ...options, format },
                onRun: async (options) => {
                    model = options.model || 'unknown'
                    router = options.model as string
                    return options
                }
            }),
            new Promise((_, reject) =>
                setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
            )
        ]) as string[]

        if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) {
            testResult = {
                test: testName,
                prompt,
                result: [],
                expected,
                model,
                router,
                timestamp: new Date().toISOString(),
                passed: false,
                duration: Date.now() - startTime,
                reason: 'Model returned empty response'
            }
        } else {
            const actual = result?.[0] || ''
            const equalityCheck = options.equalityCheck || EqualityCheck.DEFAULT
            const checkFn = EQUALITY_CHECKS[equalityCheck] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]
            const passed = await checkFn(actual, expected, prompt)

            testResult = {
                test: testName,
                prompt,
                result: result || [],
                expected,
                model,
                router,
                timestamp: new Date().toISOString(),
                passed,
                duration: Date.now() - startTime,
                reason: passed ? undefined : `Expected ${expected}, but got ${actual.trim()}`,
            }
        }
    } catch (e) {
        error = formatError(e)
        testResult = {
            test: testName,
            prompt,
            result: [],
            expected,
            model,
            router,
            timestamp: new Date().toISOString(),
            passed: false,
            duration: Date.now() - startTime,
            error,
            reason: error?.message || 'Unknown error occurred'
        }
        throw e
    } finally {
        if (testResult) {
            // Extract category from logPath (e.g., 'reports/basic.json' -> 'basic')
            const category = path.basename(logPath, path.extname(logPath))

            // Add category to test result
            testResult.category = category

            // Update category-specific log
            const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath) as string) : { results: [], highscores: [] }
            const updatedResults = [...(existingData.results || []), testResult]

            // Group results by test and model
            const latestResults = new Map<string, Map<string, TestResult>>()
            updatedResults.forEach(result => {
                if (!latestResults.has(result.test)) {
                    latestResults.set(result.test, new Map())
                }
                const testMap = latestResults.get(result.test)!
                const existingResult = testMap.get(result.model)
                if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
                    testMap.set(result.model, result)
                }
            })

            // Generate highscores
            const highscores = generateHighscores(latestResults)

            // Write category-specific results
            write(logPath, JSON.stringify({
                results: updatedResults,
                highscores,
                lastUpdated: new Date().toISOString()
            }, null, 2))

            // Update central all.json log
            const allLogPath = path.resolve(REPORTS_DIR, 'all.json')
            const allExistingData = exists(allLogPath) === 'file' ? JSON.parse(read(allLogPath) as string) : { results: [], highscores: [] }
            const allUpdatedResults = [...(allExistingData.results || []), testResult]

            // Group all results by test and model
            const allLatestResults = new Map<string, Map<string, TestResult>>()
            allUpdatedResults.forEach(result => {
                if (!allLatestResults.has(result.test)) {
                    allLatestResults.set(result.test, new Map())
                }
                const testMap = allLatestResults.get(result.test)!
                const existingResult = testMap.get(result.model)
                if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
                    testMap.set(result.model, result)
                }
            })

            // Generate highscores for all results
            const allHighscores = generateHighscores(allLatestResults)

            // Write all results
            write(allLogPath, JSON.stringify({
                results: allUpdatedResults,
                highscores: allHighscores,
                lastUpdated: new Date().toISOString()
            }, null, 2))
        }
    }
    // testResult is always assigned on the non-throwing path above
    return testResult!
}

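// Illustrative sketch of a call site (the test name, prompt, and category here are
// hypothetical); it relies only on the helpers defined above.
export const exampleRunAdditionTest = async (): Promise<TestResult> => {
    return runTest(
        'What is 2 + 2? Answer with a single number.',
        '4',
        'basic-addition',
        getDefaultModels(ModelCategory.FAST)[0],
        getReportPaths('basic', 'json'),
        'completion',
        { equalityCheck: EqualityCheck.DEFAULT }
    )
}
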
export const generateTestReport = (
    testResults: TestResult[],
    reportTitle: string,
    reportPath: string
): void => {
    // Group results by test and model
    const latestResults = new Map<string, Map<string, TestResult>>()

    // Get only the latest result for each test+model combination
    testResults.forEach(result => {
        if (!latestResults.has(result.test)) {
            latestResults.set(result.test, new Map())
        }
        const testMap = latestResults.get(result.test)!
        const existingResult = testMap.get(result.model)
        if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
            testMap.set(result.model, result)
        }
    })

    // Generate markdown report
    let report = `# ${reportTitle}\n\n`

    // Add highscore section
    report += '## Highscores\n\n'

    // Add regular test rankings
    report += '### Performance Rankings (Duration)\n\n'
    report += '| Test | Model | Duration (ms) | Duration (s) |\n'
    report += '|------|-------|--------------|--------------|\n'

    Array.from(latestResults.entries()).forEach(([testName, modelResults]) => {
        const sortedResults = Array.from(modelResults.entries())
            .map(([model, result]) => ({
                model,
                duration: result.duration || 0
            }))
            .sort((a, b) => a.duration - b.duration)

        sortedResults.forEach(({ model, duration }) => {
            report += `| ${testName} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
        })
    })
    report += '\n'

    // Add summary section
    report += '## Summary\n\n'
    const totalTests = testResults.length
    const passedTests = testResults.filter(r => r.passed).length
    const failedTests = totalTests - passedTests
    // Guard against division by zero when no results were passed in
    const avgDuration = totalTests > 0 ? testResults.reduce((sum, r) => sum + (r.duration || 0), 0) / totalTests : 0
    const successRate = totalTests > 0 ? (passedTests / totalTests) * 100 : 0
    report += `- Total Tests: ${totalTests}\n`
    report += `- Passed: ${passedTests}\n`
    report += `- Failed: ${failedTests}\n`
    report += `- Success Rate: ${successRate.toFixed(2)}%\n`
    report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n`

    // First list failed tests
    report += '## Failed Tests\n\n'
    let hasFailures = false
    for (const [testName, modelResults] of latestResults) {
        for (const [model, result] of modelResults) {
            if (!result.passed) {
                hasFailures = true
                report += `### ${testName} - ${model}\n\n`
                report += `- Prompt: \`${result.prompt}\`\n`
                report += `- Expected: \`${result.expected}\`\n`
                report += `- Actual: \`${result.result[0] || ''}\`\n`
                report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
                if (result.error) {
                    report += `- Error Type: ${result.error.type}\n`
                    report += `- Error Code: ${result.error.code}\n`
                    report += `- Error Message: ${result.error.message}\n`
                    if (result.error.details?.message) {
                        report += `- Error Details: ${result.error.details.message}\n`
                    }
                }
                report += `- Reason: ${result.reason}\n`
                report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
            }
        }
    }

    if (!hasFailures) {
        report += '*No failed tests*\n\n'
    }

    // Then list passed tests
    report += '## Passed Tests\n\n'
    let hasPassed = false
    for (const [testName, modelResults] of latestResults) {
        for (const [model, result] of modelResults) {
            if (result.passed) {
                hasPassed = true
                report += `### ${testName} - ${model}\n\n`
                report += `- Prompt: \`${result.prompt}\`\n`
                report += `- Expected: \`${result.expected}\`\n`
                report += `- Actual: \`${result.result[0] || ''}\`\n`
                report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
                report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
            }
        }
    }

    if (!hasPassed) {
        report += '*No passed tests*\n\n'
    }

    // Write report to file
    write(reportPath, report)
}

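// Illustrative sketch (hypothetical category name): regenerating the markdown report for
// one category from its JSON log, using only the helpers defined above.
export const exampleRebuildReport = (category: string): void => {
    const jsonPath = getReportPaths(category, 'json')
    const results: TestResult[] = exists(jsonPath) === 'file'
        ? JSON.parse(read(jsonPath) as string).results || []
        : []
    generateTestReport(results, `${category} test report`, getReportPaths(category, 'md'))
}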