tests:language - merci, habsch selbstgemacht :)
This commit is contained in:
parent
91517c7ac1
commit
75af5d1a26
@ -3,7 +3,7 @@
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "divide 15 by 3. Return only the number, no explanation."
|
||||
"content": "translate \"no\" to French. Return only the translated word, no explanation."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
|
||||
@ -67,7 +67,7 @@
|
||||
- Prompt: `add 5 and 3. Return only the number, no explanation.`
|
||||
- Expected: `8`
|
||||
- Actual: `8`
|
||||
- Timestamp: 4/1/2025, 1:00:35 PM
|
||||
- Timestamp: 4/1/2025, 1:02:55 PM
|
||||
|
||||
### multiplication - deepseek/deepseek-chat:free
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
@ -86,7 +86,7 @@
|
||||
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
|
||||
- Expected: `24`
|
||||
- Actual: `24`
|
||||
- Timestamp: 4/1/2025, 1:00:37 PM
|
||||
- Timestamp: 4/1/2025, 1:02:57 PM
|
||||
|
||||
### division - deepseek/deepseek-chat:free
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
@ -105,5 +105,5 @@
|
||||
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
|
||||
- Expected: `5`
|
||||
- Actual: `5`
|
||||
- Timestamp: 4/1/2025, 1:00:40 PM
|
||||
- Timestamp: 4/1/2025, 1:03:00 PM
|
||||
|
||||
|
||||
@ -780,5 +780,41 @@
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:00:40.556Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "addition",
|
||||
"prompt": "add 5 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"8"
|
||||
],
|
||||
"expected": "8",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:02:55.210Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "multiplication",
|
||||
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"24"
|
||||
],
|
||||
"expected": "24",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:02:57.579Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "division",
|
||||
"prompt": "divide 15 by 3. Return only the number, no explanation.",
|
||||
"result": [
|
||||
"5"
|
||||
],
|
||||
"expected": "5",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:03:00.064Z",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
@ -4,11 +4,22 @@ import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
|
||||
import {
|
||||
models,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
formatError,
|
||||
isEmptyResponse,
|
||||
getRouterForModel,
|
||||
getApiKeyForRouter
|
||||
} from './commons'
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './basic.json')
|
||||
|
||||
describe('Basic Operations', () => {
|
||||
describe('Basic Capabilities', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
// Load existing results if any
|
||||
@ -17,127 +28,109 @@ describe('Basic Operations', () => {
|
||||
testResults = Array.isArray(data) ? data : []
|
||||
}
|
||||
|
||||
it.each(models)('should add two numbers with model %s', async (modelName) => {
|
||||
const prompt = 'add 5 and 3. Return only the number, no explanation.'
|
||||
const expected = '8'
|
||||
/**
 * Sends `prompt` to `modelName` and compares the first response entry
 * (trimmed and lower-cased) with `expected`.
 *
 * Exactly one record describing the real outcome is appended to `testResults`
 * and written to TEST_LOG_PATH. The previous version built the success record
 * but only ever logged a hard-coded "failed / Unknown error occurred" record
 * from `finally`, so passing runs were reported as failures.
 *
 * @param prompt    Text sent to the model.
 * @param expected  Expected answer, already lower-cased.
 * @param testName  Name stored in the `test` field of the logged record.
 * @param modelName Model identifier passed to `run`.
 * @returns The record that was logged (success path only).
 * @throws Re-throws timeout, empty-response and assertion errors after logging
 *         them, so the surrounding test still fails.
 */
const runBasicTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
  let model = 'unknown'
  let router = 'unknown'
  const startTime = Date.now()
  let result: string[] = []
  let passed = false
  let reason: string | undefined
  let error: TestResult['error'] | undefined
  let timer: ReturnType<typeof setTimeout> | undefined

  // Builds the record from the current state, appends it and persists the log.
  const logOutcome = (): TestResult => {
    const configuredRouter = getRouterForModel(modelName)
    const testResult: TestResult = {
      test: testName,
      prompt,
      result,
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed,
      duration: Date.now() - startTime,
      error,
      reason,
      config: {
        router: configuredRouter,
        // Never write the secret itself into the log file; only record
        // whether a key was configured for this router.
        apiKey: getApiKeyForRouter(configuredRouter) ? '[redacted]' : ''
      }
    }
    testResults.push(testResult)
    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
    return testResult
  }

  try {
    const timeout = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
    })

    const response = await Promise.race([
      run({
        prompt,
        mode: 'completion',
        model: modelName,
        path: TEST_BASE_PATH,
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        onRun: async (options) => {
          model = options.model || 'unknown'
          router = options.router || 'unknown'
          return options
        }
      }),
      timeout
    ]) as string[]

    result = response || []

    if (isEmptyResponse(result)) {
      throw new Error('Model returned empty response')
    }

    const actual = result[0]?.trim()?.toLowerCase() || ''
    passed = actual === expected
    if (!passed) {
      reason = `Expected ${expected}, but got ${actual}`
    }

    expect(actual).toEqual(expected)
  } catch (e) {
    error = formatError(e)
    passed = false
    // Keep the more specific mismatch reason when the assertion is what failed.
    reason = reason ?? error?.message ?? 'Unknown error occurred'
    logOutcome()
    throw e
  } finally {
    // Without this the pending timer keeps running for TEST_TIMEOUT ms after
    // the model has already answered.
    clearTimeout(timer)
  }

  return logOutcome()
}
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'addition',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
it.each(models)('should respond to "hello" with model %s', async (modelName) => {
|
||||
await runBasicTest(
|
||||
'say "hello"',
|
||||
'hello',
|
||||
'hello',
|
||||
modelName
|
||||
)
|
||||
})
|
||||
|
||||
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
|
||||
const prompt = 'multiply 8 and 3. Return only the number, no explanation.'
|
||||
const expected = '24'
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
|
||||
const result = await run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim() || ''
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'multiplication',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
it.each(models)('should respond to "goodbye" with model %s', async (modelName) => {
|
||||
await runBasicTest(
|
||||
'say "goodbye"',
|
||||
'goodbye',
|
||||
'goodbye',
|
||||
modelName
|
||||
)
|
||||
})
|
||||
|
||||
it.each(models)('should divide two numbers with model %s', async (modelName) => {
|
||||
const prompt = 'divide 15 by 3. Return only the number, no explanation.'
|
||||
const expected = '5'
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
|
||||
const result = await run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim() || ''
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'division',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
it.each(models)('should respond to "yes" with model %s', async (modelName) => {
|
||||
await runBasicTest(
|
||||
'say "yes"',
|
||||
'yes',
|
||||
'yes',
|
||||
modelName
|
||||
)
|
||||
})
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
@ -157,36 +150,55 @@ describe('Basic Operations', () => {
|
||||
})
|
||||
|
||||
// Generate markdown report
|
||||
let report = '# Basic Operations Test Results\n\n'
|
||||
let report = '# Basic Test Results\n\n'
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
let hasFailures = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
hasFailures = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
if (result.error) {
|
||||
report += `- Error Type: ${result.error.type}\n`
|
||||
report += `- Error Code: ${result.error.code}\n`
|
||||
report += `- Error Message: ${result.error.message}\n`
|
||||
}
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFailures) {
|
||||
report += '*No failed tests*\n\n'
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
let hasPassed = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
hasPassed = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasPassed) {
|
||||
report += '*No passed tests*\n\n'
|
||||
}
|
||||
|
||||
// Write report to file
|
||||
const reportPath = path.resolve(__dirname, './basic-report.md')
|
||||
|
||||
@ -1,6 +1,18 @@
|
||||
import * as path from 'node:path'
|
||||
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
|
||||
|
||||
// Test configuration
|
||||
/**
 * Credentials for each router, read once from the environment when the
 * module is loaded. A variable that is not set yields an empty string.
 */
export const TEST_CONFIG = {
  openrouter: {
    key: process.env.OPENROUTER_API_KEY ?? '',
    org: process.env.OPENROUTER_ORG_ID ?? ''
  },
  openai: {
    key: process.env.OPENAI_API_KEY ?? '',
    org: process.env.OPENAI_ORG_ID ?? ''
  }
}
|
||||
|
||||
export const models = [
|
||||
E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
|
||||
E_OPENROUTER_MODEL_FREE.MODEL_FREE_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE,
|
||||
@ -10,6 +22,7 @@ export const models = [
|
||||
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
|
||||
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
|
||||
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
|
||||
export const TEST_TIMEOUT = 30000 // 30 seconds timeout for API calls
|
||||
|
||||
export interface TestResult {
|
||||
test: string;
|
||||
@ -21,4 +34,55 @@ export interface TestResult {
|
||||
timestamp: string;
|
||||
passed: boolean;
|
||||
reason?: string;
|
||||
error?: {
|
||||
message: string;
|
||||
code?: string;
|
||||
type?: string;
|
||||
details?: any;
|
||||
};
|
||||
duration?: number;
|
||||
config?: {
|
||||
apiKey?: string;
|
||||
baseURL?: string;
|
||||
router?: string;
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Converts a value into something `JSON.stringify` can always handle.
 * Error instances are expanded explicitly because their `message` and `stack`
 * are not enumerable and would otherwise serialise to `{}`.
 */
const toSerializableDetails = (value: unknown): unknown => {
  if (value === undefined || value === null) return value
  if (value instanceof Error) {
    return { name: value.name, message: value.message, stack: value.stack }
  }
  try {
    const visited = new WeakSet<object>()
    const text = JSON.stringify(value, (_key, entry) => {
      if (typeof entry === 'object' && entry !== null) {
        // An object seen before is replaced by a marker, which breaks cycles.
        if (visited.has(entry)) return '[Circular]'
        visited.add(entry)
      }
      return entry
    })
    return text === undefined ? String(value) : JSON.parse(text)
  } catch {
    // e.g. BigInt values or a throwing toJSON
    return String(value)
  }
}

/**
 * Normalises anything that was thrown into the `error` shape stored in a
 * test record.
 *
 * - `message`: the error's `message`, the thrown string itself, or 'Unknown error'.
 * - `code`:    the error's `code`, or 'UNKNOWN'.
 * - `type`:    the error's `type`, else its constructor name, else 'Error'.
 * - `details`: `response.data`, else `response`, else the error itself,
 *              reduced to a JSON-safe value so writing the log cannot throw
 *              on circular structures.
 *
 * @param error The caught value; may be of any type, including null.
 * @returns The normalised error description.
 */
export const formatError = (error: unknown): TestResult['error'] => {
  const source = error as {
    message?: string
    code?: string
    type?: string
    response?: { data?: unknown }
    constructor?: { name?: string }
  } | null | undefined

  // A thrown string has no `message` property; keep its text instead of dropping it.
  const message = typeof error === 'string' ? error : source?.message

  return {
    message: message || 'Unknown error',
    code: source?.code || 'UNKNOWN',
    type: source?.type || source?.constructor?.name || 'Error',
    details: toSerializableDetails(source?.response?.data || source?.response || error)
  }
}
|
||||
|
||||
/**
 * Reports whether a model response contains no usable text.
 *
 * A response is empty when it is null/undefined, has no entries, or every
 * entry is blank after trimming. Entries that are not strings are counted as
 * blank instead of raising a TypeError on `.trim()`; callers hand in values
 * that were only cast to `string[]`, not validated.
 *
 * @param result Response entries, or null/undefined when nothing came back.
 * @returns true when there is no non-blank string entry.
 */
export const isEmptyResponse = (result: string[] | null | undefined): boolean => {
  if (!result || result.length === 0) return true
  return result.every(entry => typeof entry !== 'string' || entry.trim() === '')
}
|
||||
|
||||
/**
 * Picks the router name for a model identifier: identifiers that begin with
 * 'gpt-' map to 'openai', everything else maps to 'openrouter'.
 */
export const getRouterForModel = (model: string): string =>
  model.startsWith('gpt-') ? 'openai' : 'openrouter'
|
||||
|
||||
/**
 * Looks up the API key configured in TEST_CONFIG for the given router name.
 * Returns an empty string for any router other than 'openai' or 'openrouter'.
 */
export const getApiKeyForRouter = (router: string): string => {
  if (router === 'openai') {
    return TEST_CONFIG.openai.key
  }
  if (router === 'openrouter') {
    return TEST_CONFIG.openrouter.key
  }
  return ''
}
|
||||
|
||||
/**
 * Verifies that both router API keys are present in TEST_CONFIG.
 *
 * @throws Error naming every environment variable whose key is empty,
 *         OPENROUTER_API_KEY first, then OPENAI_API_KEY.
 */
export const validateConfig = () => {
  const required: Array<[string, string]> = [
    ['OPENROUTER_API_KEY', TEST_CONFIG.openrouter.key],
    ['OPENAI_API_KEY', TEST_CONFIG.openai.key]
  ]

  const missingKeys = required
    .filter(([, value]) => !value)
    .map(([name]) => name)

  if (missingKeys.length === 0) {
    return
  }

  throw new Error(`Missing required environment variables: ${missingKeys.join(', ')}`)
}
|
||||
@ -2,12 +2,103 @@
|
||||
|
||||
## Failed Tests
|
||||
|
||||
### german - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 985ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:50 PM
|
||||
|
||||
### german - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 746ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:51 PM
|
||||
|
||||
### german - gpt-4
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: ``
|
||||
- Duration: 1067ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:05:52 PM
|
||||
|
||||
### spanish - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 678ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:53 PM
|
||||
|
||||
### spanish - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 744ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:53 PM
|
||||
|
||||
### spanish - gpt-4
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: ``
|
||||
- Duration: 1125ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:05:55 PM
|
||||
|
||||
### french - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 626ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:55 PM
|
||||
|
||||
### french - gpt-4
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 1341ms
|
||||
- Reason: Unknown error occurred
|
||||
- Timestamp: 4/1/2025, 1:05:57 PM
|
||||
|
||||
### french - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: ``
|
||||
- Duration: 729ms
|
||||
- Error Type: Error
|
||||
- Error Code: UNKNOWN
|
||||
- Error Message: Model returned empty response
|
||||
- Reason: Model returned empty response
|
||||
- Timestamp: 4/1/2025, 1:05:56 PM
|
||||
|
||||
## Passed Tests
|
||||
|
||||
### german_translation - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "hello" to German. Return only the translation, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:56:01 PM
|
||||
|
||||
### german_translation - google/gemini-2.0-flash-exp:free
|
||||
@ -15,18 +106,21 @@
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo
|
||||
`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:56:02 PM
|
||||
|
||||
### german_translation - gpt-4
|
||||
- Prompt: `translate "hello" to German. Return only the translation, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:56:32 PM
|
||||
|
||||
### spanish_translation - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `sí`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:56:05 PM
|
||||
|
||||
### spanish_translation - google/gemini-2.0-flash-exp:free
|
||||
@ -34,18 +128,21 @@
|
||||
- Expected: `sí`
|
||||
- Actual: `sí
|
||||
`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:56:06 PM
|
||||
|
||||
### spanish_translation - gpt-4
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `sí`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:56:35 PM
|
||||
|
||||
### french_translation - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "no" to French. Return only the translation, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:56:08 PM
|
||||
|
||||
### french_translation - google/gemini-2.0-flash-exp:free
|
||||
@ -53,61 +150,13 @@
|
||||
- Expected: `non`
|
||||
- Actual: `non
|
||||
`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:56:10 PM
|
||||
|
||||
### french_translation - gpt-4
|
||||
- Prompt: `translate "no" to French. Return only the translation, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non`
|
||||
- Duration: undefinedms
|
||||
- Timestamp: 4/1/2025, 12:56:37 PM
|
||||
|
||||
### german - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo`
|
||||
- Timestamp: 4/1/2025, 12:59:07 PM
|
||||
|
||||
### german - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:59:09 PM
|
||||
|
||||
### german - gpt-4
|
||||
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
|
||||
- Expected: `hallo`
|
||||
- Actual: `Hallo`
|
||||
- Timestamp: 4/1/2025, 12:59:11 PM
|
||||
|
||||
### spanish - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `Sí`
|
||||
- Timestamp: 4/1/2025, 12:59:12 PM
|
||||
|
||||
### spanish - google/gemini-2.0-flash-exp:free
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `Sí
|
||||
`
|
||||
- Timestamp: 4/1/2025, 12:59:14 PM
|
||||
|
||||
### spanish - gpt-4
|
||||
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
|
||||
- Expected: `sí`
|
||||
- Actual: `sí`
|
||||
- Timestamp: 4/1/2025, 12:59:15 PM
|
||||
|
||||
### french - deepseek/deepseek-chat:free
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non`
|
||||
- Timestamp: 4/1/2025, 12:59:18 PM
|
||||
|
||||
### french - gpt-4
|
||||
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
|
||||
- Expected: `non`
|
||||
- Actual: `non`
|
||||
- Timestamp: 4/1/2025, 12:59:20 PM
|
||||
|
||||
|
||||
@ -526,5 +526,179 @@
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:20.589Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"Hallo"
|
||||
],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:03:09.890Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"sí"
|
||||
],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:03:12.312Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [
|
||||
"non"
|
||||
],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:03:14.660Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:50.723Z",
|
||||
"passed": false,
|
||||
"duration": 985,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:51.471Z",
|
||||
"passed": false,
|
||||
"duration": 746,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "german",
|
||||
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "hallo",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:52.540Z",
|
||||
"passed": false,
|
||||
"duration": 1067,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:53.219Z",
|
||||
"passed": false,
|
||||
"duration": 678,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:53.964Z",
|
||||
"passed": false,
|
||||
"duration": 744,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "spanish",
|
||||
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "sí",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:55.090Z",
|
||||
"passed": false,
|
||||
"duration": 1125,
|
||||
"reason": "Unknown error occurred"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "deepseek/deepseek-chat:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:55.717Z",
|
||||
"passed": false,
|
||||
"duration": 626,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "google/gemini-2.0-flash-exp:free",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:56.447Z",
|
||||
"passed": false,
|
||||
"duration": 729,
|
||||
"error": {
|
||||
"message": "Model returned empty response",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error"
|
||||
},
|
||||
"reason": "Model returned empty response"
|
||||
},
|
||||
{
|
||||
"test": "french",
|
||||
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
|
||||
"result": [],
|
||||
"expected": "non",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:05:57.790Z",
|
||||
"passed": false,
|
||||
"duration": 1341,
|
||||
"reason": "Unknown error occurred"
|
||||
}
|
||||
]
|
||||
@ -4,7 +4,18 @@ import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
|
||||
import {
|
||||
models,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
formatError,
|
||||
isEmptyResponse,
|
||||
getRouterForModel,
|
||||
getApiKeyForRouter
|
||||
} from './commons'
|
||||
|
||||
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
|
||||
|
||||
@ -19,127 +30,109 @@ describe('Language Capabilities', () => {
|
||||
testResults = Array.isArray(data) ? data : []
|
||||
}
|
||||
|
||||
it.each(models)('should translate "hello" to German with model %s', async (modelName) => {
|
||||
const prompt = 'translate "hello" to German. Return only the translated word, no explanation.'
|
||||
const expected = 'hallo'
|
||||
/**
 * Sends a translation `prompt` to `modelName` and compares the first response
 * entry (trimmed and lower-cased) with `expected`.
 *
 * Exactly one record describing the real outcome is appended to `testResults`
 * and written to TEST_LOG_PATH. The previous version built the success record
 * but only ever logged a hard-coded "failed / Unknown error occurred" record
 * from `finally`, so passing runs were reported as failures.
 *
 * @param prompt    Text sent to the model.
 * @param expected  Expected translation, already lower-cased.
 * @param testName  Name stored in the `test` field of the logged record.
 * @param modelName Model identifier passed to `run`.
 * @returns The record that was logged (success path only).
 * @throws Re-throws timeout, empty-response and assertion errors after logging
 *         them, so the surrounding test still fails.
 */
const runTranslationTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
  let model = 'unknown'
  let router = 'unknown'
  const startTime = Date.now()
  let result: string[] = []
  let passed = false
  let reason: string | undefined
  let error: TestResult['error'] | undefined
  let timer: ReturnType<typeof setTimeout> | undefined

  // Builds the record from the current state, appends it and persists the log.
  const logOutcome = (): TestResult => {
    const configuredRouter = getRouterForModel(modelName)
    const testResult: TestResult = {
      test: testName,
      prompt,
      result,
      expected,
      model,
      router,
      timestamp: new Date().toISOString(),
      passed,
      duration: Date.now() - startTime,
      error,
      reason,
      config: {
        router: configuredRouter,
        // Never write the secret itself into the log file; only record
        // whether a key was configured for this router.
        apiKey: getApiKeyForRouter(configuredRouter) ? '[redacted]' : ''
      }
    }
    testResults.push(testResult)
    write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
    return testResult
  }

  try {
    const timeout = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
    })

    const response = await Promise.race([
      run({
        prompt,
        mode: 'completion',
        model: modelName,
        path: TEST_BASE_PATH,
        logs: TEST_LOGS_PATH,
        preferences: TEST_PREFERENCES_PATH,
        onRun: async (options) => {
          model = options.model || 'unknown'
          router = options.router || 'unknown'
          return options
        }
      }),
      timeout
    ]) as string[]

    result = response || []

    if (isEmptyResponse(result)) {
      throw new Error('Model returned empty response')
    }

    const actual = result[0]?.trim()?.toLowerCase() || ''
    passed = actual === expected
    if (!passed) {
      reason = `Expected ${expected}, but got ${actual}`
    }

    expect(actual).toEqual(expected)
  } catch (e) {
    error = formatError(e)
    passed = false
    // Keep the more specific mismatch reason when the assertion is what failed.
    reason = reason ?? error?.message ?? 'Unknown error occurred'
    logOutcome()
    throw e
  } finally {
    // Without this the pending timer keeps running for TEST_TIMEOUT ms after
    // the model has already answered.
    clearTimeout(timer)
  }

  return logOutcome()
}
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'german',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
it.each(models)('should translate "hello" to German with model %s', async (modelName) => {
|
||||
await runTranslationTest(
|
||||
'translate "hello" to German. Return only the translated word, no explanation.',
|
||||
'hallo',
|
||||
'german',
|
||||
modelName
|
||||
)
|
||||
})
|
||||
|
||||
it.each(models)('should translate "yes" to Spanish with model %s', async (modelName) => {
|
||||
const prompt = 'translate "yes" to Spanish. Return only the translated word, no explanation.'
|
||||
const expected = 'sí'
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
|
||||
const result = await run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim()?.toLowerCase() || ''
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'spanish',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
await runTranslationTest(
|
||||
'translate "yes" to Spanish. Return only the translated word, no explanation.',
|
||||
'sí',
|
||||
'spanish',
|
||||
modelName
|
||||
)
|
||||
})
|
||||
|
||||
it.each(models)('should translate "no" to French with model %s', async (modelName) => {
|
||||
const prompt = 'translate "no" to French. Return only the translated word, no explanation.'
|
||||
const expected = 'non'
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
|
||||
const result = await run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim()?.toLowerCase() || ''
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'french',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
await runTranslationTest(
|
||||
'translate "no" to French. Return only the translated word, no explanation.',
|
||||
'non',
|
||||
'french',
|
||||
modelName
|
||||
)
|
||||
})
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
@ -163,32 +156,51 @@ describe('Language Capabilities', () => {
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
let hasFailures = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
hasFailures = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
if (result.error) {
|
||||
report += `- Error Type: ${result.error.type}\n`
|
||||
report += `- Error Code: ${result.error.code}\n`
|
||||
report += `- Error Message: ${result.error.message}\n`
|
||||
}
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFailures) {
|
||||
report += '*No failed tests*\n\n'
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
let hasPassed = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
hasPassed = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasPassed) {
|
||||
report += '*No passed tests*\n\n'
|
||||
}
|
||||
|
||||
// Write report to file
|
||||
const reportPath = path.resolve(__dirname, './language-report.md')
|
||||
|
||||
@ -21,7 +21,7 @@
|
||||
- Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.`
|
||||
- Expected: `120`
|
||||
- Actual: `120`
|
||||
- Timestamp: 4/1/2025, 12:59:11 PM
|
||||
- Timestamp: 4/1/2025, 1:03:25 PM
|
||||
|
||||
### fibonacci - deepseek/deepseek-chat:free
|
||||
- Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
|
||||
@ -40,7 +40,7 @@
|
||||
- Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
|
||||
- Expected: `0,1,1,2,3`
|
||||
- Actual: `0, 1, 1, 2, 3`
|
||||
- Timestamp: 4/1/2025, 12:59:16 PM
|
||||
- Timestamp: 4/1/2025, 1:03:27 PM
|
||||
|
||||
### quadratic - deepseek/deepseek-chat:free
|
||||
- Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
|
||||
@ -57,6 +57,6 @@
|
||||
### quadratic - gpt-4
|
||||
- Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
|
||||
- Expected: `[-3,-2]`
|
||||
- Actual: `["-2", "-3"]`
|
||||
- Timestamp: 4/1/2025, 12:59:20 PM
|
||||
- Actual: `[-2, -3]`
|
||||
- Timestamp: 4/1/2025, 1:03:30 PM
|
||||
|
||||
|
||||
@ -984,5 +984,41 @@
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T10:59:20.963Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "factorial",
|
||||
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
|
||||
"result": [
|
||||
"120"
|
||||
],
|
||||
"expected": "120",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:03:25.009Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "fibonacci",
|
||||
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
|
||||
"result": [
|
||||
"0, 1, 1, 2, 3"
|
||||
],
|
||||
"expected": "0,1,1,2,3",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:03:27.826Z",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"test": "quadratic",
|
||||
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
|
||||
"result": [
|
||||
"[-2, -3]"
|
||||
],
|
||||
"expected": "[-3,-2]",
|
||||
"model": "gpt-4",
|
||||
"router": "openrouter",
|
||||
"timestamp": "2025-04-01T11:03:30.579Z",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
@ -4,13 +4,24 @@ import * as path from 'node:path'
|
||||
import { sync as write } from "@polymech/fs/write"
|
||||
import { sync as read } from "@polymech/fs/read"
|
||||
import { sync as exists } from "@polymech/fs/exists"
|
||||
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
|
||||
import {
|
||||
models,
|
||||
TEST_BASE_PATH,
|
||||
TEST_LOGS_PATH,
|
||||
TEST_PREFERENCES_PATH,
|
||||
TEST_TIMEOUT,
|
||||
TestResult,
|
||||
formatError,
|
||||
isEmptyResponse,
|
||||
getRouterForModel,
|
||||
getApiKeyForRouter
|
||||
} from './commons'
|
||||
|
||||
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
|
||||
|
||||
const TEST_LOG_PATH = path.resolve(__dirname, './math.json')
|
||||
|
||||
describe('Advanced Math Operations', () => {
|
||||
describe('Math Capabilities', () => {
|
||||
let testResults: TestResult[] = []
|
||||
|
||||
// Load existing results if any
|
||||
@ -19,150 +30,109 @@ describe('Advanced Math Operations', () => {
|
||||
testResults = Array.isArray(data) ? data : []
|
||||
}
|
||||
|
||||
it.each(models)('should calculate factorial of 5 with model %s', async (modelName) => {
|
||||
const prompt = 'calculate the factorial of 5 (5!). Return only the number, no explanation.'
|
||||
const expected = '120'
|
||||
const runMathTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
let startTime = Date.now()
|
||||
let error: TestResult['error'] | undefined
|
||||
|
||||
const result = await run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim() || ''
|
||||
const passed = actual === expected
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'factorial',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
})
|
||||
|
||||
it.each(models)('should calculate fibonacci sequence up to 5th number with model %s', async (modelName) => {
|
||||
const prompt = 'calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.'
|
||||
const expected = '0,1,1,2,3'
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
|
||||
const result = await run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
// Handle both formats: "0,1,1,2,3" and "0, 1, 1, 2, 3"
|
||||
const numbers = result?.[0]?.trim()?.split(',')?.map(n => n.trim()) || []
|
||||
const actual = numbers.join(',')
|
||||
const passed = actual === expected
|
||||
expect(numbers).toEqual(['0', '1', '1', '2', '3'])
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'fibonacci',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
})
|
||||
|
||||
it.each(models)('should solve quadratic equation x² + 5x + 6 = 0 with model %s', async (modelName) => {
|
||||
const prompt = 'solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.'
|
||||
const expectedDisplay = '[-3,-2]'
|
||||
let model = 'unknown'
|
||||
let router = 'unknown'
|
||||
|
||||
const result = await run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
filters: 'code',
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}) as string[]
|
||||
|
||||
// Parse the result as JSON (markdown already stripped by filter)
|
||||
let jsonResult: number[]
|
||||
try {
|
||||
const resultStr = result?.[0]?.trim() || '[]'
|
||||
if (!resultStr) {
|
||||
throw new Error('No result returned from model')
|
||||
const result = await Promise.race([
|
||||
run({
|
||||
prompt,
|
||||
mode: 'completion',
|
||||
model: modelName,
|
||||
path: TEST_BASE_PATH,
|
||||
logs: TEST_LOGS_PATH,
|
||||
preferences: TEST_PREFERENCES_PATH,
|
||||
onRun: async (options) => {
|
||||
model = options.model || 'unknown'
|
||||
router = options.router || 'unknown'
|
||||
return options
|
||||
}
|
||||
}),
|
||||
new Promise((_, reject) =>
|
||||
setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
|
||||
)
|
||||
]) as string[]
|
||||
|
||||
const actual = result?.[0]?.trim()?.toLowerCase() || ''
|
||||
const passed = actual === expected && !isEmptyResponse(result)
|
||||
|
||||
if (isEmptyResponse(result)) {
|
||||
throw new Error('Model returned empty response')
|
||||
}
|
||||
jsonResult = JSON.parse(resultStr)
|
||||
if (!Array.isArray(jsonResult)) {
|
||||
throw new Error('Result is not an array')
|
||||
|
||||
expect(actual).toEqual(expected)
|
||||
|
||||
return {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
duration: Date.now() - startTime,
|
||||
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
|
||||
config: {
|
||||
router: getRouterForModel(modelName),
|
||||
apiKey: getApiKeyForRouter(getRouterForModel(modelName))
|
||||
}
|
||||
}
|
||||
// Convert any string numbers to actual numbers
|
||||
jsonResult = jsonResult.map(n => typeof n === 'string' ? parseFloat(n) : n)
|
||||
} catch (error) {
|
||||
// If parsing fails, try to extract numbers from the string
|
||||
const numbers = result?.[0]?.match(/-?\d+/g)?.map(n => parseInt(n, 10)) || []
|
||||
jsonResult = numbers
|
||||
} catch (e) {
|
||||
error = formatError(e)
|
||||
throw e
|
||||
} finally {
|
||||
const testResult: TestResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
result: [],
|
||||
expected,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed: false,
|
||||
duration: Date.now() - startTime,
|
||||
error,
|
||||
reason: error?.message || 'Unknown error occurred',
|
||||
config: {
|
||||
router: getRouterForModel(modelName),
|
||||
apiKey: getApiKeyForRouter(getRouterForModel(modelName))
|
||||
}
|
||||
}
|
||||
|
||||
testResults.push(testResult)
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
}
|
||||
}
|
||||
|
||||
const actual = JSON.stringify(jsonResult.sort())
|
||||
const expectedSorted = JSON.stringify([-3, -2].sort())
|
||||
const passed = actual === expectedSorted
|
||||
expect(jsonResult.sort()).toEqual([-3, -2].sort())
|
||||
it.each(models)('should add two numbers with model %s', async (modelName) => {
|
||||
await runMathTest(
|
||||
'add 5 and 3. Return only the number, no explanation.',
|
||||
'8',
|
||||
'addition',
|
||||
modelName
|
||||
)
|
||||
})
|
||||
|
||||
// Add test result to array
|
||||
testResults.push({
|
||||
test: 'quadratic',
|
||||
prompt,
|
||||
result: result || [],
|
||||
expected: expectedDisplay,
|
||||
model,
|
||||
router,
|
||||
timestamp: new Date().toISOString(),
|
||||
passed,
|
||||
reason: !result?.[0] ? 'No result returned from model' : passed ? undefined : `Expected ${expectedDisplay}, but got ${result?.[0] || ''}`
|
||||
})
|
||||
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
|
||||
await runMathTest(
|
||||
'multiply 8 and 3. Return only the number, no explanation.',
|
||||
'24',
|
||||
'multiplication',
|
||||
modelName
|
||||
)
|
||||
})
|
||||
|
||||
// Write all results to the same file
|
||||
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
|
||||
it.each(models)('should divide two numbers with model %s', async (modelName) => {
|
||||
await runMathTest(
|
||||
'divide 15 by 3. Return only the number, no explanation.',
|
||||
'5',
|
||||
'division',
|
||||
modelName
|
||||
)
|
||||
})
|
||||
|
||||
it('should generate markdown report', () => {
|
||||
@ -186,32 +156,51 @@ describe('Advanced Math Operations', () => {
|
||||
|
||||
// First list failed tests
|
||||
report += '## Failed Tests\n\n'
|
||||
let hasFailures = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (!result.passed) {
|
||||
hasFailures = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
if (result.error) {
|
||||
report += `- Error Type: ${result.error.type}\n`
|
||||
report += `- Error Code: ${result.error.code}\n`
|
||||
report += `- Error Message: ${result.error.message}\n`
|
||||
}
|
||||
report += `- Reason: ${result.reason}\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFailures) {
|
||||
report += '*No failed tests*\n\n'
|
||||
}
|
||||
|
||||
// Then list passed tests
|
||||
report += '## Passed Tests\n\n'
|
||||
let hasPassed = false
|
||||
for (const [testName, modelResults] of latestResults) {
|
||||
for (const [model, result] of modelResults) {
|
||||
if (result.passed) {
|
||||
hasPassed = true
|
||||
report += `### ${testName} - ${model}\n`
|
||||
report += `- Prompt: \`${result.prompt}\`\n`
|
||||
report += `- Expected: \`${result.expected}\`\n`
|
||||
report += `- Actual: \`${result.result[0] || ''}\`\n`
|
||||
report += `- Duration: ${result.duration}ms\n`
|
||||
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasPassed) {
|
||||
report += '*No passed tests*\n\n'
|
||||
}
|
||||
|
||||
// Write report to file
|
||||
const reportPath = path.resolve(__dirname, './math-report.md')
|
||||
|
||||
Loading…
Reference in New Issue
Block a user