// mono/packages/kbot/tests/unit/math.test.ts

import { describe, it, expect } from 'vitest'
import { run } from '../../src/index'
import * as path from 'node:path'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import {
  models,
  TEST_BASE_PATH,
  TEST_LOGS_PATH,
  TEST_PREFERENCES_PATH,
  TEST_TIMEOUT,
  TestResult,
  formatError,
  isEmptyResponse
} from './commons'


// Absolute path of the JSON file (next to this test file) that stores the
// accumulated math test results; it is read at suite setup and rewritten
// after every test run.
const TEST_LOG_PATH = path.resolve(__dirname, './math.json')

/**
 * Math capability suite.
 *
 * Runs a few arithmetic prompts against every entry of `models`, appends one
 * record per run to the JSON log at `TEST_LOG_PATH`, and finally renders the
 * latest record per test/model pair into `math-report.md`.
 */
describe('Math Capabilities', () => {
  let testResults: TestResult[] = []

  // Seed the in-memory list with records persisted by earlier runs so the
  // log file accumulates history instead of being overwritten.
  if (exists(TEST_LOG_PATH)) {
    const data = read(TEST_LOG_PATH, 'json')
    // Anything that is not an array is ignored and the log starts fresh.
    testResults = Array.isArray(data) ? data : []
  }

  /**
   * Sends `prompt` to `run` in completion mode, compares the first returned
   * entry (trimmed, lower-cased) with `expected`, and appends exactly one
   * record describing the outcome to the JSON log.
   *
   * @param prompt    Text passed to `run`.
   * @param expected  Expected normalized answer.
   * @param testName  Logical test name stored in the record (`test` field).
   * @param modelName Model identifier passed to `run`.
   * @returns The record that was appended to the log.
   * @throws Re-throws whatever made the run fail (timeout, empty response,
   *         assertion mismatch, or an error raised by `run`) after the
   *         failure has been logged.
   */
  const runMathTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
    let model = 'unknown'
    let router = 'unknown'
    const startTime = Date.now()
    let error: TestResult['error'] | undefined
    let result: string[] = []
    let passed = false
    let timer: ReturnType<typeof setTimeout> | undefined
    // Wrapped in an object so that even a falsy thrown value is detected.
    let failure: { cause: unknown } | undefined

    try {
      const response = await Promise.race([
        run({
          prompt,
          mode: 'completion',
          model: modelName,
          path: TEST_BASE_PATH,
          logs: TEST_LOGS_PATH,
          preferences: TEST_PREFERENCES_PATH,
          // Capture the model/router that were actually used for the record.
          onRun: async (options) => {
            model = options.model || 'unknown'
            router = options.router || 'unknown'
            return options
          }
        }),
        new Promise<never>((_, reject) => {
          timer = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
        })
      ]) as string[]

      // Keep the raw output so it is logged even when the checks below fail.
      result = response || []

      if (isEmptyResponse(response)) {
        throw new Error('Model returned empty response')
      }

      const actual = response?.[0]?.trim()?.toLowerCase() || ''
      expect(actual).toEqual(expected)
      passed = true
    } catch (e) {
      failure = { cause: e }
      error = formatError(e)
    } finally {
      // Do not leave the timeout timer pending once the race has settled.
      clearTimeout(timer)
    }

    // One record per run, reflecting the real outcome (pass or fail).
    const record: TestResult = {
      test: testName,
      prompt,
      result,
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed,
      duration: Date.now() - startTime,
      error,
      reason: passed ? undefined : error?.message || 'Unknown error occurred'
    }

    testResults.push(record)
    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))

    // Fail the surrounding test only after the outcome has been persisted.
    if (failure) {
      throw failure.cause
    }

    return record
  }

  /**
   * Renders one report entry. Failed entries additionally list the error
   * details (when present) and the failure reason.
   */
  const renderEntry = (testName: string, model: string, result: TestResult): string => {
    const lines = [
      `### ${testName} - ${model}`,
      `- Prompt: \`${result.prompt}\``,
      `- Expected: \`${result.expected}\``,
      // Records come unvalidated from disk, so tolerate a missing result array.
      `- Actual: \`${result.result?.[0] || ''}\``,
      `- Duration: ${result.duration}ms`
    ]

    if (!result.passed) {
      if (result.error) {
        lines.push(
          `- Error Type: ${result.error.type}`,
          `- Error Code: ${result.error.code}`,
          `- Error Message: ${result.error.message}`
        )
      }
      lines.push(`- Reason: ${result.reason}`)
    }

    lines.push(`- Timestamp: ${new Date(result.timestamp).toLocaleString()}`)
    return `${lines.join('\n')}\n\n`
  }

  it.each(models)('should add two numbers with model %s', async (modelName) => {
    await runMathTest(
      'add 5 and 3. Return only the number, no explanation.',
      '8',
      'addition',
      modelName
    )
  })

  it.each(models)('should multiply two numbers with model %s', async (modelName) => {
    await runMathTest(
      'multiply 8 and 3. Return only the number, no explanation.',
      '24',
      'multiplication',
      modelName
    )
  })

  it.each(models)('should divide two numbers with model %s', async (modelName) => {
    await runMathTest(
      'divide 15 by 3. Return only the number, no explanation.',
      '5',
      'division',
      modelName
    )
  })

  it('should generate markdown report', () => {
    // test name -> model -> most recent record for that combination
    const latestResults = new Map<string, Map<string, TestResult>>()

    for (const result of testResults) {
      let byModel = latestResults.get(result.test)
      if (!byModel) {
        byModel = new Map<string, TestResult>()
        latestResults.set(result.test, byModel)
      }
      const previous = byModel.get(result.model)
      if (!previous || new Date(result.timestamp) > new Date(previous.timestamp)) {
        byModel.set(result.model, result)
      }
    }

    // Renders all entries whose pass state matches `wantPassed`, or the
    // placeholder note when there are none.
    const renderSection = (heading: string, emptyNote: string, wantPassed: boolean): string => {
      let section = heading
      let count = 0
      for (const [testName, modelResults] of latestResults) {
        for (const [model, result] of modelResults) {
          if (Boolean(result.passed) === wantPassed) {
            count += 1
            section += renderEntry(testName, model, result)
          }
        }
      }
      return count === 0 ? section + emptyNote : section
    }

    // Failed tests are listed first, then the passed ones.
    const report =
      '# Math Test Results\n\n' +
      renderSection('## Failed Tests\n\n', '*No failed tests*\n\n', false) +
      renderSection('## Passed Tests\n\n', '*No passed tests*\n\n', true)

    // Write report to file
    const reportPath = path.resolve(__dirname, './math-report.md')
    write(reportPath, report)

    // Verify report was written
    expect(exists(reportPath)).toBe('file')
  })
})