import * as path from 'node:path'
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL, E_OPENROUTER_MODEL, E_Mode } from '../../src/index'
import { run } from '../../src/index'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { sync as mkdirp } from "mkdirp"
import { zodResponseFormat } from "openai/helpers/zod"
import { JSONSchemaToZod } from '@dmitryrechkin/json-schema-to-zod';

export enum ModelCategory {
    FAST = 'fast',
    LANGUAGE = 'language',
    TOOL = 'tool',
    ALL = 'all',
    CODING = 'coding',
    FILES = 'file',
    TEST_EQUAL = ''
}

export enum EqualityCheck {
    DEFAULT = 'default',
    JSON_EQUAL = 'json_equal',
    LLM_EQUAL = 'llm_equal',
    NONE = 'none'
}

export type EqualityFn = (actual: string, expected: string, prompt?: string) => Promise<boolean>;

export const EQUALITY_CHECKS: Record<string, EqualityFn> = {
    [EqualityCheck.DEFAULT]: async (actual: string, expected: string): Promise<boolean> => {
        return (actual || '').trim().toLowerCase() === (expected || '').trim().toLowerCase();
    },
    [EqualityCheck.JSON_EQUAL]: async (actual: string, expected: string): Promise<boolean> => {
        try {
            // Parse and re-stringify both sides to normalize before comparing
            const actualJson = JSON.parse(actual.trim());
            const expectedJson = JSON.parse(expected.trim());
            return JSON.stringify(actualJson) === JSON.stringify(expectedJson);
        } catch (e) {
            return false;
        }
    },
    [EqualityCheck.LLM_EQUAL]: async (actual: string, expected: string, prompt?: string): Promise<boolean> => {
        try {
            const model = getTestEqualModels()[0];
            if (!model) {
                return false;
            }
            const comparisonPrompt = `You are an assistant that judges if two AI responses are semantically equivalent for a given prompt.
The original prompt was: "${prompt}"

Response A:
"${actual}"
Response B (expected):
"${expected}"
Are these two responses semantically equivalent? Consider that minor differences in formatting (like commas, casing) or phrasing should be ignored as long as the meaning is the same. Don't comment, just return the JSON object.`;
            const responseSchema = {
                type: 'object',
                properties: {
                    equivalent: {
                        type: 'boolean',
                        description: "Whether the two responses are semantically equivalent."
                    },
                    semantic_distance: {
                        type: 'number',
                        description: "The semantic distance between the two responses, on a scale of 0 to 100."
                    }
                },
                required: ['equivalent', 'semantic_distance']
            };

            const zodSchema = JSONSchemaToZod.convert(responseSchema);
            const format = zodResponseFormat(zodSchema, 'equivalent');

            const result = await run({
                prompt: comparisonPrompt,
                model,
                mode: 'completion',
                format
            });

            if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) {
                return false;
            }

            const parsedResult = JSON.parse(result[0] as string);
            return parsedResult.equivalent === true;
        } catch (e) {
            return false;
        }
    },
    [EqualityCheck.NONE]: async (): Promise<boolean> => {
        return true;
    },
};

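// Illustrative sketch (not part of the original helpers): resolving a comparator from
// EQUALITY_CHECKS by its EqualityCheck key and falling back to the default check for
// unknown keys. The helper name `exampleCompare` is hypothetical.
export const exampleCompare = async (
    check: EqualityCheck,
    actual: string,
    expected: string,
    prompt?: string
): Promise<boolean> => {
    const checkFn = EQUALITY_CHECKS[check] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]
    return checkFn(actual, expected, prompt)
}
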
export const getFastModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_SONNET_4,
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
        E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE
    ]
}

export const getTestEqualModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
    ]
}

export const getCodingModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_NVIDIA_LLAMA_3_3_NEMOTRON_SUPER_49B_V1_FREE
    ]
}

export const getFileModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
    ]
}

export const getLanguageModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_SONNET_4,
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI,
        E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE
    ]
}

export const getResearchModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_PERPLEXITY_SONAR_DEEP_RESEARCH
    ]
}

export const getToolModels = (): string[] => {
    return [
        E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
    ]
}

export const getDefaultModels = (category: ModelCategory = ModelCategory.FAST): string[] => {
    switch (category) {
        case ModelCategory.FAST:
            return getFastModels()
        case ModelCategory.LANGUAGE:
            return getLanguageModels()
        case ModelCategory.TOOL:
            return getToolModels()
        case ModelCategory.CODING:
            return getCodingModels()
        case ModelCategory.FILES:
            return getFileModels()
        case ModelCategory.TEST_EQUAL:
            return getTestEqualModels()
        case ModelCategory.ALL:
        default:
            return [
                E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE,
                E_OPENROUTER_MODEL.MODEL_ANTHROPIC_CLAUDE_3_7_SONNET,
                E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI
            ]
    }
}

export const isOpenRouterModel = (model: string): boolean => {
    return Object.values(E_OPENROUTER_MODEL).includes(model as E_OPENROUTER_MODEL)
}

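// Illustrative sketch: combining the helpers above to keep only the models in a
// category that are routed through OpenRouter. The function name is hypothetical.
export const exampleOpenRouterModelsFor = (category: ModelCategory = ModelCategory.ALL): string[] => {
    return getDefaultModels(category).filter(isOpenRouterModel)
}
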
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
export const TEST_TIMEOUT = 60000 // 60 seconds timeout for API calls

// Report paths configuration
export const REPORTS_DIR = path.resolve(__dirname, './reports')

// Ensure reports directory exists
if (exists(REPORTS_DIR) !== 'directory') {
    mkdirp(REPORTS_DIR)
}

export const getReportPaths = (category: string, type: 'json' | 'md'): string => {
    const base = path.resolve(REPORTS_DIR, category)
    return `${base}.${type}`
}

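// For example (illustrative, the category name 'basic' is hypothetical):
// getReportPaths('basic', 'json') resolves to `${REPORTS_DIR}/basic.json`,
// and getReportPaths('basic', 'md') to `${REPORTS_DIR}/basic.md`.
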
export interface TestResult {
    test: string;
    prompt: string;
    result: string[];
    expected: string;
    model: string;
    router: string;
    timestamp: string;
    passed: boolean;
    reason?: string;
    error?: {
        message: string;
        code?: string;
        type?: string;
        details?: any;
    };
    duration?: number;
    category?: string;
}

export interface TestHighscore {
    test: string;
    rankings: {
        model: string;
        duration: number;
        duration_secs: number;
    }[];
}

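// Illustrative example of the shape a single TestResult takes (all values hypothetical):
// {
//   test: 'basic-addition',
//   prompt: 'What is 2 + 2?',
//   result: ['4'],
//   expected: '4',
//   model: 'openai/gpt-4o-mini',
//   router: 'openrouter',
//   timestamp: '2025-01-01T00:00:00.000Z',
//   passed: true,
//   duration: 1234,
//   category: 'basic'
// }
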
export const formatError = (error: any): TestResult['error'] => {
    return {
        message: error?.message || 'Unknown error',
        code: error?.code || 'UNKNOWN',
        type: error?.type || error?.constructor?.name || 'Error',
        details: error?.response?.data || error?.response || error
    }
}

export const isEmptyResponse = (result: string[] | null | undefined): boolean => {
    return !result || result.length === 0 || result.every(r => !r || r.trim() === '')
}

export const generateHighscores = (latestResults: Map<string, Map<string, TestResult>>): TestHighscore[] => {
    const highscores: TestHighscore[] = []

    for (const [testName, modelResults] of latestResults) {
        // Convert model results to array and sort by duration
        const sortedResults = Array.from(modelResults.entries())
            .map(([model, result]) => ({ model, result }))
            .sort((a, b) => (a.result.duration || 0) - (b.result.duration || 0))
            .slice(0, 2) // Get top 2

        if (sortedResults.length > 0) {
            highscores.push({
                test: testName,
                rankings: sortedResults.map(({ model, result }) => ({
                    model,
                    duration: result.duration || 0,
                    duration_secs: (result.duration || 0) / 1000
                }))
            })
        }
    }

    return highscores
}

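// Illustrative sketch (hypothetical helper, not part of the original module): grouping a
// flat list of results into the Map<test, Map<model, latest TestResult>> shape that
// generateHighscores expects, mirroring the grouping done inline in runTest below.
export const exampleGroupLatestResults = (results: TestResult[]): Map<string, Map<string, TestResult>> => {
    const grouped = new Map<string, Map<string, TestResult>>()
    results.forEach(result => {
        if (!grouped.has(result.test)) {
            grouped.set(result.test, new Map())
        }
        const testMap = grouped.get(result.test)!
        const existing = testMap.get(result.model)
        // Keep only the most recent result per test+model combination
        if (!existing || new Date(result.timestamp) > new Date(existing.timestamp)) {
            testMap.set(result.model, result)
        }
    })
    return grouped
}
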
export const runTest = async (
    prompt: string,
    expected: string,
    testName: string,
    modelName: string,
    logPath: string,
    mode: "completion" | "tools" | "assistant" | "custom" = "completion",
    options: any = {}
): Promise<TestResult> => {
    let model = 'unknown'
    let router = 'openrouter'
    const startTime = Date.now()
    let error: TestResult['error'] | undefined
    let testResult: TestResult | undefined
    const defaultOptions = {
        filters: 'code'
    }
    let format: any = null
    if (options.format) {
        const zodSchema = JSONSchemaToZod.convert(options.format);
        format = zodResponseFormat(zodSchema, "format");
    }
    try {
        const result = await Promise.race([
            run({
                prompt,
                mode,
                model: modelName,
                path: TEST_BASE_PATH,
                logs: TEST_LOGS_PATH,
                preferences: TEST_PREFERENCES_PATH,
                logLevel: 2,
                ...{ ...defaultOptions, ...options, format },
                onRun: async (options) => {
                    model = options.model || 'unknown'
                    router = options.model as string
                    return options
                }
            }),
            new Promise((_, reject) =>
                setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
            )
        ]) as string[]

        if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) {
            testResult = {
                test: testName,
                prompt,
                result: [],
                expected,
                model,
                router,
                timestamp: new Date().toISOString(),
                passed: false,
                duration: Date.now() - startTime,
                reason: 'Model returned empty response'
            }
        } else {
            const actual = result?.[0] || ''
            const equalityCheck = options.equalityCheck || EqualityCheck.DEFAULT
            const checkFn = EQUALITY_CHECKS[equalityCheck] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]
            const passed = await checkFn(actual, expected, prompt)

            testResult = {
                test: testName,
                prompt,
                result: result || [],
                expected,
                model,
                router,
                timestamp: new Date().toISOString(),
                passed,
                duration: Date.now() - startTime,
                reason: passed ? undefined : `Expected ${expected}, but got ${actual.trim()}`,
            }
        }
    } catch (e) {
        error = formatError(e)
        testResult = {
            test: testName,
            prompt,
            result: [],
            expected,
            model,
            router,
            timestamp: new Date().toISOString(),
            passed: false,
            duration: Date.now() - startTime,
            error,
            reason: error?.message || 'Unknown error occurred'
        }
        throw e
    } finally {
        if (testResult) {
            // Extract category from logPath (e.g., 'reports/basic.json' -> 'basic')
            const category = path.basename(logPath, path.extname(logPath))

            // Add category to test result
            testResult.category = category

            // Update category-specific log
            const existingData = exists(logPath) === 'file' ? JSON.parse(read(logPath) as string) : { results: [], highscores: [] }
            const updatedResults = [...(existingData.results || []), testResult]

            // Group results by test and model
            const latestResults = new Map<string, Map<string, TestResult>>()
            updatedResults.forEach(result => {
                if (!latestResults.has(result.test)) {
                    latestResults.set(result.test, new Map())
                }
                const testMap = latestResults.get(result.test)!
                const existingResult = testMap.get(result.model)
                if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
                    testMap.set(result.model, result)
                }
            })

            // Generate highscores
            const highscores = generateHighscores(latestResults)

            // Write category-specific results
            write(logPath, JSON.stringify({
                results: updatedResults,
                highscores,
                lastUpdated: new Date().toISOString()
            }, null, 2))

            // Update central all.json log
            const allLogPath = path.resolve(REPORTS_DIR, 'all.json')
            const allExistingData = exists(allLogPath) === 'file' ? JSON.parse(read(allLogPath) as string) : { results: [], highscores: [] }
            const allUpdatedResults = [...(allExistingData.results || []), testResult]

            // Group all results by test and model
            const allLatestResults = new Map<string, Map<string, TestResult>>()
            allUpdatedResults.forEach(result => {
                if (!allLatestResults.has(result.test)) {
                    allLatestResults.set(result.test, new Map())
                }
                const testMap = allLatestResults.get(result.test)!
                const existingResult = testMap.get(result.model)
                if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
                    testMap.set(result.model, result)
                }
            })

            // Generate highscores for all results
            const allHighscores = generateHighscores(allLatestResults)

            // Write all results
            write(allLogPath, JSON.stringify({
                results: allUpdatedResults,
                highscores: allHighscores,
                lastUpdated: new Date().toISOString()
            }, null, 2))
        }
    }
    // testResult is always assigned on the non-throwing path above
    return testResult!
}

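// Illustrative sketch of a call site (the test name, prompt, and category here are
// hypothetical); it relies only on the helpers defined above.
export const exampleRunAdditionTest = async (): Promise<TestResult> => {
    return runTest(
        'What is 2 + 2? Answer with a single number.',
        '4',
        'basic-addition',
        getDefaultModels(ModelCategory.FAST)[0],
        getReportPaths('basic', 'json'),
        'completion',
        { equalityCheck: EqualityCheck.DEFAULT }
    )
}
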
export const generateTestReport = (
    testResults: TestResult[],
    reportTitle: string,
    reportPath: string
): void => {
    // Group results by test and model
    const latestResults = new Map<string, Map<string, TestResult>>()

    // Get only the latest result for each test+model combination
    testResults.forEach(result => {
        if (!latestResults.has(result.test)) {
            latestResults.set(result.test, new Map())
        }
        const testMap = latestResults.get(result.test)!
        const existingResult = testMap.get(result.model)
        if (!existingResult || new Date(result.timestamp) > new Date(existingResult.timestamp)) {
            testMap.set(result.model, result)
        }
    })

    // Generate markdown report
    let report = `# ${reportTitle}\n\n`

    // Add highscore section
    report += '## Highscores\n\n'

    // Add regular test rankings
    report += '### Performance Rankings (Duration)\n\n'
    report += '| Test | Model | Duration (ms) | Duration (s) |\n'
    report += '|------|-------|--------------|--------------|\n'

    Array.from(latestResults.entries()).forEach(([testName, modelResults]) => {
        const sortedResults = Array.from(modelResults.entries())
            .map(([model, result]) => ({
                model,
                duration: result.duration || 0
            }))
            .sort((a, b) => a.duration - b.duration)

        sortedResults.forEach(({ model, duration }) => {
            report += `| ${testName} | ${model} | ${duration.toFixed(0)} | ${(duration / 1000).toFixed(2)} |\n`
        })
    })
    report += '\n'

    // Add summary section
    report += '## Summary\n\n'
    const totalTests = testResults.length
    const passedTests = testResults.filter(r => r.passed).length
    const failedTests = totalTests - passedTests
    // Guard against division by zero when no results were passed in
    const avgDuration = totalTests > 0 ? testResults.reduce((sum, r) => sum + (r.duration || 0), 0) / totalTests : 0
    const successRate = totalTests > 0 ? (passedTests / totalTests) * 100 : 0
    report += `- Total Tests: ${totalTests}\n`
    report += `- Passed: ${passedTests}\n`
    report += `- Failed: ${failedTests}\n`
    report += `- Success Rate: ${successRate.toFixed(2)}%\n`
    report += `- Average Duration: ${avgDuration.toFixed(0)}ms (${(avgDuration / 1000).toFixed(2)}s)\n\n`

    // First list failed tests
    report += '## Failed Tests\n\n'
    let hasFailures = false
    for (const [testName, modelResults] of latestResults) {
        for (const [model, result] of modelResults) {
            if (!result.passed) {
                hasFailures = true
                report += `### ${testName} - ${model}\n\n`
                report += `- Prompt: \`${result.prompt}\`\n`
                report += `- Expected: \`${result.expected}\`\n`
                report += `- Actual: \`${result.result[0] || ''}\`\n`
                report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
                if (result.error) {
                    report += `- Error Type: ${result.error.type}\n`
                    report += `- Error Code: ${result.error.code}\n`
                    report += `- Error Message: ${result.error.message}\n`
                    if (result.error.details?.message) {
                        report += `- Error Details: ${result.error.details.message}\n`
                    }
                }
                report += `- Reason: ${result.reason}\n`
                report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
            }
        }
    }

    if (!hasFailures) {
        report += '*No failed tests*\n\n'
    }

    // Then list passed tests
    report += '## Passed Tests\n\n'
    let hasPassed = false
    for (const [testName, modelResults] of latestResults) {
        for (const [model, result] of modelResults) {
            if (result.passed) {
                hasPassed = true
                report += `### ${testName} - ${model}\n\n`
                report += `- Prompt: \`${result.prompt}\`\n`
                report += `- Expected: \`${result.expected}\`\n`
                report += `- Actual: \`${result.result[0] || ''}\`\n`
                report += `- Duration: ${result.duration || 0}ms (${((result.duration || 0) / 1000).toFixed(2)}s)\n`
                report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
            }
        }
    }

    if (!hasPassed) {
        report += '*No passed tests*\n\n'
    }

    // Write report to file
    write(reportPath, report)
}

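// Illustrative sketch (hypothetical category name): regenerating the markdown report for
// one category from its JSON log, using only the helpers defined above.
export const exampleRebuildReport = (category: string): void => {
    const jsonPath = getReportPaths(category, 'json')
    const results: TestResult[] = exists(jsonPath) === 'file'
        ? JSON.parse(read(jsonPath) as string).results || []
        : []
    generateTestReport(results, `${category} test report`, getReportPaths(category, 'md'))
}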