mono/packages/kbot/tests/unit/commons.ts
import * as path from 'node:path'
import { E_OPENROUTER_MODEL, run } from '../../src/index'
import { sync as write } from '@polymech/fs/write'
import { sync as read } from '@polymech/fs/read'
import { sync as exists } from '@polymech/fs/exists'
import { sync as mkdirp } from 'mkdirp'
import { zodResponseFormat } from 'openai/helpers/zod'
import { JSONSchemaToZod } from '@dmitryrechkin/json-schema-to-zod'
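
/** Categories used to pick a default model list for a given kind of test. */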
export enum ModelCategory {
  FAST = 'fast',
  LANGUAGE = 'language',
  TOOL = 'tool',
  ALL = 'all',
  CODING = 'coding',
  FILES = 'file',
  TEST_EQUAL = ''
}
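
/** Strategies for deciding whether a model response matches the expected answer. */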
export enum EqualityCheck {
  DEFAULT = 'default',
  JSON_EQUAL = 'json_equal',
  LLM_EQUAL = 'llm_equal',
  NONE = 'none'
}
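
/**
 * EQUALITY_CHECKS below maps each EqualityCheck to an async comparison function:
 * DEFAULT does a trimmed, case-insensitive string compare, JSON_EQUAL compares
 * parsed-and-restringified JSON, LLM_EQUAL asks a judge model whether the two
 * responses are semantically equivalent, and NONE always passes.
 */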
export type EqualityFn = (actual: string, expected: string, prompt?: string) => Promise<boolean>;

export const EQUALITY_CHECKS: Record<string, EqualityFn> = {
  [EqualityCheck.DEFAULT]: async (actual: string, expected: string): Promise<boolean> => {
    return (actual || '').trim().toLowerCase() === (expected || '').trim().toLowerCase();
  },
  [EqualityCheck.JSON_EQUAL]: async (actual: string, expected: string): Promise<boolean> => {
    try {
      // we just stringify to normalize and compare
      const actualJson = JSON.parse(actual.trim());
      const expectedJson = JSON.parse(expected.trim());
      return JSON.stringify(actualJson) === JSON.stringify(expectedJson);
    } catch (e) {
      return false;
    }
  },
  [EqualityCheck.LLM_EQUAL]: async (actual: string, expected: string, prompt?: string): Promise<boolean> => {
    try {
      const model = getTestEqualModels()[0];
      if (!model) {
        return false;
      }
      const comparisonPrompt = `You are an assistant that judges if two AI responses are semantically equivalent for a given prompt.
The original prompt was: "${prompt}"
Response A:
"${actual}"
Response B (expected):
"${expected}"
Are these two responses semantically equivalent? Consider that minor differences in formatting (like commas, casing) or phrasing should be ignored as long as the meaning is the same. Don't comment, just return the JSON object.`;
      const responseSchema = {
        type: 'object',
        properties: {
          equivalent: {
            type: 'boolean',
            description: "Whether the two responses are semantically equivalent."
          },
          semantic_distance: {
            type: 'number',
            description: "The semantic distance between the two responses, on a scale of 0 to 100."
          }
        },
        required: ['equivalent', 'semantic_distance']
      };
      const zodSchema = JSONSchemaToZod.convert(responseSchema);
      const format = zodResponseFormat(zodSchema, 'equivalent');
      const result = await run({
        prompt: comparisonPrompt,
        model,
        mode: 'completion',
        format
      });
      if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) {
        return false;
      }
      const parsedResult = JSON.parse(result[0] as string);
      return parsedResult.equivalent === true;
    } catch (e) {
      return false;
    }
  },
  [EqualityCheck.NONE]: async (): Promise<boolean> => {
    return true;
  },
};
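
/**
 * Model lists per category. These are the concrete OpenRouter models the test
 * suites run against; getDefaultModels() dispatches on ModelCategory.
 */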
export const getFastModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_SONNET_4,
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
    E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE
  ]
}
export const getTestEqualModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
  ]
}
export const getCodingModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_NVIDIA_LLAMA_3_3_NEMOTRON_SUPER_49B_V1_FREE
  ]
}
export const getFileModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
  ]
}
export const getLanguageModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_SONNET_4,
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
    E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE
  ]
}
export const getResearchModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_PERPLEXITY_SONAR_DEEP_RESEARCH
  ]
}
export const getToolModels = (): string[] => {
  return [
    E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
  ]
}
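
/** Returns the model list for a category; ALL (and anything unrecognised) falls back to a small mixed set. */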
export const getDefaultModels = (category: ModelCategory = ModelCategory.FAST): string[] => {
  switch (category) {
    case ModelCategory.FAST:
      return getFastModels()
    case ModelCategory.LANGUAGE:
      return getLanguageModels()
    case ModelCategory.TOOL:
      return getToolModels()
    case ModelCategory.CODING:
      return getCodingModels()
    case ModelCategory.FILES:
      return getFileModels()
    case ModelCategory.TEST_EQUAL:
      return getTestEqualModels()
    case ModelCategory.ALL:
    default:
      return [
        E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE,
        E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_7_SONNET,
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI
      ]
  }
}
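
/** True if the given model id is one of the known E_OPENROUTER_MODEL values. */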
export const isOpenRouterModel = (model: string): boolean => {
  return Object.values(E_OPENROUTER_MODEL).includes(model as E_OPENROUTER_MODEL)
}
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
export const TEST_TIMEOUT = 60000 // 60 seconds timeout for API calls
// Report paths configuration
export const REPORTS_DIR = path.resolve(__dirname, './reports')
// Ensure reports directory exists
if (exists(REPORTS_DIR) !== 'directory') {
  mkdirp(REPORTS_DIR)
}
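
/** Builds the report file path for a category, e.g. getReportPaths('basic', 'json') resolves to '<REPORTS_DIR>/basic.json'. */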
export const getReportPaths = (category: string, type: 'json' | 'md'): string => {
  const base = path.resolve(REPORTS_DIR, category)
  return `${base}.${type}`
}
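
/** Outcome of a single prompt/model run, as persisted to the JSON report logs. */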
export interface TestResult {
  test: string;
  prompt: string;
  result: string[];
  expected: string;
  model: string;
  router: string;
  timestamp: string;
  passed: boolean;
  reason?: string;
  error?: {
    message: string;
    code?: string;
    type?: string;
    details?: any;
  };
  duration?: number;
  category?: string;
}
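
/** Fastest models per test, derived from the latest result of each test/model pair. */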
export interface TestHighscore {
  test: string;
  rankings: {
    model: string;
    duration: number;
    duration_secs: number;
  }[];
}
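
/** Normalises an unknown thrown value into the TestResult error shape. */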
export const formatError = (error: any): TestResult['error'] => {
  return {
    message: error?.message || 'Unknown error',
    code: error?.code || 'UNKNOWN',
    type: error?.type || error?.constructor?.name || 'Error',
    details: error?.response?.data || error?.response || error
  }
}
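
/** True when the model returned nothing, or only blank strings. */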
export const isEmptyResponse = (result: string[] | null | undefined): boolean => {
  return !result || result.length === 0 || result.every(r => !r || r.trim() === '')
}
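
/**
 * Builds per-test duration rankings (top 2 fastest models) from the latest
 * result of each test/model combination.
 */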
export const generateHighscores = (latestResults: Map<string, Map<string, TestResult>>): TestHighscore[] => {
  const highscores: TestHighscore[] = []
  for (const [testName, modelResults] of latestResults) {
    // Convert model results to array and sort by duration
    const sortedResults = Array.from(modelResults.entries())
      .map(([model, result]) => ({ model, result }))
      .sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0))
      .slice(0, 2) // Get top 2
    if (sortedResults.length > 0) {
      highscores.push({
        test: testName,
        rankings: sortedResults.map(({ model, result }) => ({
          model,
          duration: result.duration || 0,
          duration_secs: (result.duration || 0) / 1000
        }))
      })
    }
  }
  return highscores
}
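
/**
 * Runs a single prompt against a model (raced against TEST_TIMEOUT), applies the
 * configured equality check, and appends the outcome both to the category log at
 * `logPath` and to the central `all.json` report.
 *
 * Illustrative usage only (a sketch; assumes a Jest/Vitest-style runner, which is
 * not part of this module):
 *
 *   it('answers basic arithmetic', async () => {
 *     const result = await runTest(
 *       'What is 2 + 2? Reply with the number only.',
 *       '4',
 *       'basic-addition',
 *       getFastModels()[0],
 *       getReportPaths('basic', 'json')
 *     )
 *     expect(result.passed).toBe(true)
 *   }, TEST_TIMEOUT)
 */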
export const runTest = async (
  prompt: string,
  expected: string,
  testName: string,
  modelName: string,
  logPath: string,
  mode: "completion" | "tools" | "assistant" | "custom" = "completion",
  options: any = {}
): Promise<TestResult> => {
  let model = 'unknown'
  let router = 'openrouter'
  const startTime = Date.now()
  let error: TestResult['error'] | undefined
  let testResult: TestResult | undefined
  const defaultOptions = {
    filters: 'code'
  }
  let format: any = null
  if (options.format) {
    const zodSchema = JSONSchemaToZod.convert(options.format);
    format = zodResponseFormat(zodSchema, "format");
  }
  try {
    const result = await Promise.race([
      run({
        prompt,
        mode,
        model: modelName,
        path: TEST_BASE_PATH,
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        logLevel: 2,
        ...{ ...defaultOptions, ...options, format },
        onRun: async (options) => {
          model = options.model || 'unknown'
          router = options.model as string
          return options
        }
      }),
      new Promise((_, reject) =>
        setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
      )
    ]) as string[]
    if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) {
      testResult = {
        test: testName,
        prompt,
        result: [],
        expected,
        model,
        router,
        timestamp: new Date().toISOString(),
        passed: false,
        duration: Date.now() - startTime,
        reason: 'Model returned empty response'
      }
    } else {
      const actual = result?.[0] || ''
      const equalityCheck = options.equalityCheck || EqualityCheck.DEFAULT
      const checkFn = EQUALITY_CHECKS[equalityCheck] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]
      const passed = await checkFn(actual, expected, prompt)
      testResult = {
        test: testName,
        prompt,
        result: result || [],
        expected,
        model,
        router,
        timestamp: new Date().toISOString(),
        passed,
        duration: Date.now() - startTime,
        reason: passed ? undefined : `Expected ${expected}, but got ${actual.trim()}`,
      }
    }
  } catch (e) {
    error = formatError(e)
    testResult = {
      test: testName,
      prompt,
      result: [],
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed: false,
      duration: Date.now() - startTime,
      error,
      reason: error?.message || 'Unknown error occurred'
    }
    throw e
  } finally {
    if (testResult) {
      // Extract category from logPath (e.g., 'reports/basic.json' -> 'basic')
      const category = path.basename(logPath, path.extname(logPath))
      // Add category to test result
      testResult.category = category
      // Update category-specific log
      const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath) as string) : { results: [], highscores: [] }
      const updatedResults = [...(existingData.results || []), testResult]
      // Group results by test and model
      const latestResults = new Map<string, Map<string, TestResult>>()
      updatedResults.forEach(result => {
        if (!latestResults.has(result.test)) {
          latestResults.set(result.test, new Map())
        }
        const testMap = latestResults.get(result.test)!
        const existingResult = testMap.get(result.model)
        if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
          testMap.set(result.model, result)
        }
      })
      // Generate highscores
      const highscores = generateHighscores(latestResults)
      // Write category-specific results
      write(logPath, JSON.stringify({
        results: updatedResults,
        highscores,
        lastUpdated: new Date().toISOString()
      }, null, 2))
      // Update central all.json log
      const allLogPath = path.resolve(REPORTS_DIR, 'all.json')
      const allExistingData = exists(allLogPath) === 'file' ? JSON.parse(read(allLogPath) as string) : { results: [], highscores: [] }
      const allUpdatedResults = [...(allExistingData.results || []), testResult]
      // Group all results by test and model
      const allLatestResults = new Map<string, Map<string, TestResult>>()
      allUpdatedResults.forEach(result => {
        if (!allLatestResults.has(result.test)) {
          allLatestResults.set(result.test, new Map())
        }
        const testMap = allLatestResults.get(result.test)!
        const existingResult = testMap.get(result.model)
        if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
          testMap.set(result.model, result)
        }
      })
      // Generate highscores for all results
      const allHighscores = generateHighscores(allLatestResults)
      // Write all results
      write(allLogPath, JSON.stringify({
        results: allUpdatedResults,
        highscores: allHighscores,
        lastUpdated: new Date().toISOString()
      }, null, 2))
    }
  }
  return testResult
}
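
/**
 * Renders a markdown report (performance rankings, summary, failed and passed
 * tests) from the given results and writes it to reportPath.
 */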
export const generateTestReport = (
  testResults: TestResult[],
  reportTitle: string,
  reportPath: string
): void => {
  // Group results by test and model
  const latestResults = new Map<string, Map<string, TestResult>>()
  // Get only the latest result for each test+model combination
  testResults.forEach(result => {
    if (!latestResults.has(result.test)) {
      latestResults.set(result.test, new Map())
    }
    const testMap = latestResults.get(result.test)!
    const existingResult = testMap.get(result.model)
    if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
      testMap.set(result.model, result)
    }
  })
  // Generate markdown report
  let report = `# ${reportTitle}\n\n`
  // Add highscore section
  report += '## Highscores\n\n'
  // Add regular test rankings
  report += '### Performance Rankings (Duration)\n\n'
  report += '| Test | Model | Duration (ms) | Duration (s) |\n'
  report += '|------|-------|---------------|--------------|\n'
  Array.from(latestResults.entries()).forEach(([testName, modelResults]) => {
    const sortedResults = Array.from(modelResults.entries())
      .map(([model, result]) => ({
        model,
        duration: result.duration || 0
      }))
      .sort((a, b) => a.duration - b.duration)
    sortedResults.forEach(({ model, duration }) => {
      report += `| ${testName} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
    })
  })
  report += '\n'
  // Add summary section (guard against an empty result set to avoid NaN)
  report += '## Summary\n\n'
  const totalTests = testResults.length
  const passedTests = testResults.filter(r => r.passed).length
  const failedTests = totalTests - passedTests
  const avgDuration = totalTests > 0 ? testResults.reduce((sum, r) => sum + (r.duration || 0), 0) / totalTests : 0
  const successRate = totalTests > 0 ? (passedTests / totalTests) * 100 : 0
  report += `- Total Tests: ${totalTests}\n`
  report += `- Passed: ${passedTests}\n`
  report += `- Failed: ${failedTests}\n`
  report += `- Success Rate: ${successRate.toFixed(2)}%\n`
  report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n`
  // First list failed tests
  report += '## Failed Tests\n\n'
  let hasFailures = false
  for (const [testName, modelResults] of latestResults) {
    for (const [model, result] of modelResults) {
      if (!result.passed) {
        hasFailures = true
        report += `### ${testName} - ${model}\n\n`
        report += `- Prompt: \`${result.prompt}\`\n`
        report += `- Expected: \`${result.expected}\`\n`
        report += `- Actual: \`${result.result[0] || ''}\`\n`
        report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
        if (result.error) {
          report += `- Error Type: ${result.error.type}\n`
          report += `- Error Code: ${result.error.code}\n`
          report += `- Error Message: ${result.error.message}\n`
          if (result.error.details?.message) {
            report += `- Error Details: ${result.error.details.message}\n`
          }
        }
        report += `- Reason: ${result.reason}\n`
        report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
      }
    }
  }
  if (!hasFailures) {
    report += '*No failed tests*\n\n'
  }
  // Then list passed tests
  report += '## Passed Tests\n\n'
  let hasPassed = false
  for (const [testName, modelResults] of latestResults) {
    for (const [model, result] of modelResults) {
      if (result.passed) {
        hasPassed = true
        report += `### ${testName} - ${model}\n\n`
        report += `- Prompt: \`${result.prompt}\`\n`
        report += `- Expected: \`${result.expected}\`\n`
        report += `- Actual: \`${result.result[0] || ''}\`\n`
        report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
        report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
      }
    }
  }
  if (!hasPassed) {
    report += '*No passed tests*\n\n'
  }
  // Write report to file
  write(reportPath, report)
}