139 lines
4.9 KiB
TypeScript
139 lines
4.9 KiB
TypeScript
import { describe, it, expect } from 'vitest'
|
|
import * as path from 'node:path'
|
|
import { sync as exists } from "@polymech/fs/exists"
|
|
import {
|
|
getDefaultModels,
|
|
TEST_BASE_PATH,
|
|
TEST_LOGS_PATH,
|
|
TEST_PREFERENCES_PATH,
|
|
TEST_TIMEOUT,
|
|
TestResult,
|
|
runTest,
|
|
generateTestReport,
|
|
getReportPaths
|
|
} from './commons'
|
|
|
|
// Helper function to extract the final number(s) from a string
|
|
/**
 * Pulls the trailing numeric answer(s) out of a free-form model reply.
 *
 * The text is lower-cased, LaTeX `\boxed{...}` wrappers are unwrapped, a few
 * lead-in phrases ("the answer is", "solutions:", "answer:") are dropped and
 * Unicode minus signs are normalised to ASCII `-`. Every integer or decimal
 * literal (optionally negative) is then collected, and the answer is read
 * from the END of that list.
 *
 * @param text - Raw reply text; `undefined` or an empty string yields `null`.
 * @param expectedCount - How many numbers make up the answer (default 1).
 * @returns When `expectedCount` is 1, the last number exactly as it appears in
 *   the text. Otherwise the last `expectedCount` numbers, sorted ascending and
 *   joined with `,`. `null` when the text holds no number, or fewer numbers
 *   than requested.
 */
function extractFinalNumberOrNumbers(text: string | undefined, expectedCount: number = 1): string | null {
  if (!text) {
    return null;
  }

  const cleanedText = text
    .toLowerCase()
    // The backslash must be escaped: a bare `\b` is a word-boundary assertion,
    // so `/\boxed{...}/` could never match a literal "\boxed{".
    .replace(/\\boxed\{(.*?)\}/g, '$1')
    .replace(/the answer is/g, '')
    .replace(/solutions:/g, '')
    .replace(/answer:/g, '')
    // U+2212 MINUS SIGN is not matched by `-?` below; without this step
    // "−3" would be read as the positive number 3.
    .replace(/\u2212/g, '-');

  // All numbers: integer or decimal, positive or negative.
  const numbers = cleanedText.match(/-?\d+(\.\d+)?/g);
  if (!numbers || numbers.length === 0) {
    return null;
  }

  if (expectedCount === 1) {
    // The last number found is taken to be the intended answer.
    return numbers[numbers.length - 1] ?? null;
  }

  // Take the last `expectedCount` numbers; fewer than that means no answer.
  const relevantNumbers = numbers.slice(-expectedCount);
  if (relevantNumbers.length < expectedCount) {
    return null;
  }

  // Sort numerically so the result does not depend on the order in the reply.
  return relevantNumbers
    .map(Number)
    .sort((a, b) => a - b)
    .map(String)
    .join(',');
}
|
|
|
|
// Optionally override models for this specific test file
|
|
// Each `it.each(models)` suite in this file runs once per entry of this list.
const models = getDefaultModels()
|
|
|
|
/**
 * Math suite: sends each prompt to every model in `models`, extracts the
 * trailing number(s) from the first entry of the reply and compares them with
 * the expected answer. A final test writes a markdown report of all results.
 */
describe('Math Operations', () => {
  // Filled by every model run below and consumed by the report test at the end.
  const testResults: TestResult[] = []
  const TEST_LOG_PATH = getReportPaths('math', 'json')
  const TEST_REPORT_PATH = getReportPaths('math', 'md')

  /** One prompt/answer pair that is run against every model. */
  interface MathCase {
    /** Test title; `%s` is replaced with the model name by `it.each`. */
    title: string
    prompt: string
    /** Expected answer, passed to `runTest` and used in the assertion. */
    expected: string
    /** Category label passed to `runTest`. */
    category: string
    /** How many numbers the answer consists of. */
    answerCount: number
    /** Alternative answer that is also accepted, for one specific model only. */
    tolerated?: { model: string; answer: string }
  }

  const mathCases: MathCase[] = [
    {
      title: 'should solve quadratic equation with model %s',
      prompt: 'Solve the quadratic equation x² + 5x + 6 = 0. Respond ONLY with the solutions as comma-separated numbers (e.g., -3,-2). No other text.',
      expected: '-3,-2', // Sorted ascending, matching the extractor's output
      category: 'quadratic',
      answerCount: 2
    },
    {
      title: 'should calculate factorial with model %s',
      prompt: 'Calculate 5! (factorial of 5). Respond ONLY with the final numerical answer. No explanation, no other text.',
      expected: '120',
      category: 'factorial',
      answerCount: 1
    },
    {
      title: 'should calculate fibonacci sequence with model %s',
      prompt: 'Calculate the 6th number in the Fibonacci sequence (assuming F(1)=1, F(2)=1). Respond ONLY with the final numerical answer. No other text.',
      expected: '8', // F(6) = 8
      category: 'fibonacci',
      answerCount: 1,
      // Allow for F(5)=5 if F(0)=0 is assumed by the model, especially gpt-4o-mini
      tolerated: { model: 'openai/gpt-4o-mini', answer: '5' }
    },
    {
      title: 'should calculate square root with model %s',
      prompt: 'Calculate the square root of 16. Respond ONLY with the final numerical answer. No other text.',
      expected: '4',
      category: 'square_root',
      answerCount: 1
    },
    {
      title: 'should calculate power with model %s',
      prompt: 'Calculate 2 raised to the power of 3. Respond ONLY with the final numerical answer. No other text.',
      expected: '8',
      category: 'power',
      answerCount: 1
    }
  ]

  for (const mathCase of mathCases) {
    // The timeout is passed as a trailing number: the options object in last
    // position is deprecated in recent Vitest releases, while the numeric
    // form is still accepted.
    it.each(models)(mathCase.title, async (modelName) => {
      const result = await runTest(
        mathCase.prompt,
        mathCase.expected,
        mathCase.category,
        modelName,
        TEST_LOG_PATH
      )
      testResults.push(result)

      const actualResult = extractFinalNumberOrNumbers(result.result[0], mathCase.answerCount)

      // The tolerated answer only replaces the expectation when this exact
      // model produced exactly that answer; anything else is checked against
      // the regular expected value.
      const { tolerated } = mathCase
      const expected =
        tolerated !== undefined && modelName === tolerated.model && actualResult === tolerated.answer
          ? tolerated.answer
          : mathCase.expected
      expect(actualResult).toEqual(expected)
    }, TEST_TIMEOUT)
  }

  // Declared last so that, with in-order execution, the results pushed by the
  // tests above are already in `testResults` when the report is written.
  it('should generate markdown report', () => {
    generateTestReport(testResults, 'Math Operations Test Results', TEST_REPORT_PATH)
    // Asserting on the value itself makes a failure show what `exists` returned.
    expect(exists(TEST_REPORT_PATH)).toBe('file')
  })
})
|