tests:language - merci, habsch selbstgemacht :)

This commit is contained in:
lovebird 2025-04-01 13:10:17 +02:00
parent 91517c7ac1
commit 75af5d1a26
11 changed files with 790 additions and 418 deletions

View File

@ -3,7 +3,7 @@
"messages": [
{
"role": "user",
"content": "divide 15 by 3. Return only the number, no explanation."
"content": "translate \"no\" to French. Return only the translated word, no explanation."
},
{
"role": "user",

View File

@ -67,7 +67,7 @@
- Prompt: `add 5 and 3. Return only the number, no explanation.`
- Expected: `8`
- Actual: `8`
- Timestamp: 4/1/2025, 1:00:35 PM
- Timestamp: 4/1/2025, 1:02:55 PM
### multiplication - deepseek/deepseek-chat:free
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
@ -86,7 +86,7 @@
- Prompt: `multiply 8 and 3. Return only the number, no explanation.`
- Expected: `24`
- Actual: `24`
- Timestamp: 4/1/2025, 1:00:37 PM
- Timestamp: 4/1/2025, 1:02:57 PM
### division - deepseek/deepseek-chat:free
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
@ -105,5 +105,5 @@
- Prompt: `divide 15 by 3. Return only the number, no explanation.`
- Expected: `5`
- Actual: `5`
- Timestamp: 4/1/2025, 1:00:40 PM
- Timestamp: 4/1/2025, 1:03:00 PM

View File

@ -780,5 +780,41 @@
"router": "openrouter",
"timestamp": "2025-04-01T11:00:40.556Z",
"passed": true
},
{
"test": "addition",
"prompt": "add 5 and 3. Return only the number, no explanation.",
"result": [
"8"
],
"expected": "8",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:02:55.210Z",
"passed": true
},
{
"test": "multiplication",
"prompt": "multiply 8 and 3. Return only the number, no explanation.",
"result": [
"24"
],
"expected": "24",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:02:57.579Z",
"passed": true
},
{
"test": "division",
"prompt": "divide 15 by 3. Return only the number, no explanation.",
"result": [
"5"
],
"expected": "5",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:03:00.064Z",
"passed": true
}
]

View File

@ -4,11 +4,22 @@ import * as path from 'node:path'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
import {
models,
TEST_BASE_PATH,
TEST_LOGS_PATH,
TEST_PREFERENCES_PATH,
TEST_TIMEOUT,
TestResult,
formatError,
isEmptyResponse,
getRouterForModel,
getApiKeyForRouter
} from './commons'
const TEST_LOG_PATH = path.resolve(__dirname, './basic.json')
describe('Basic Operations', () => {
describe('Basic Capabilities', () => {
let testResults: TestResult[] = []
// Load existing results if any
@ -17,127 +28,109 @@ describe('Basic Operations', () => {
testResults = Array.isArray(data) ? data : []
}
it.each(models)('should add two numbers with model %s', async (modelName) => {
const prompt = 'add 5 and 3. Return only the number, no explanation.'
const expected = '8'
const runBasicTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
let model = 'unknown'
let router = 'unknown'
let startTime = Date.now()
let error: TestResult['error'] | undefined
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
try {
const result = await Promise.race([
run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
)
]) as string[]
const actual = result?.[0]?.trim()?.toLowerCase() || ''
const passed = actual === expected && !isEmptyResponse(result)
if (isEmptyResponse(result)) {
throw new Error('Model returned empty response')
}
}) as string[]
expect(actual).toEqual(expected)
const actual = result?.[0]?.trim() || ''
const passed = actual === expected
expect(actual).toEqual(expected)
return {
test: testName,
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
duration: Date.now() - startTime,
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
config: {
router: getRouterForModel(modelName),
apiKey: getApiKeyForRouter(getRouterForModel(modelName))
}
}
} catch (e) {
error = formatError(e)
throw e
} finally {
const testResult: TestResult = {
test: testName,
prompt,
result: [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed: false,
duration: Date.now() - startTime,
error,
reason: error?.message || 'Unknown error occurred',
config: {
router: getRouterForModel(modelName),
apiKey: getApiKeyForRouter(getRouterForModel(modelName))
}
}
testResults.push(testResult)
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
}
}
// Add test result to array
testResults.push({
test: 'addition',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
it.each(models)('should respond to "hello" with model %s', async (modelName) => {
await runBasicTest(
'say "hello"',
'hello',
'hello',
modelName
)
})
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
const prompt = 'multiply 8 and 3. Return only the number, no explanation.'
const expected = '24'
let model = 'unknown'
let router = 'unknown'
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}) as string[]
const actual = result?.[0]?.trim() || ''
const passed = actual === expected
expect(actual).toEqual(expected)
// Add test result to array
testResults.push({
test: 'multiplication',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
it.each(models)('should respond to "goodbye" with model %s', async (modelName) => {
await runBasicTest(
'say "goodbye"',
'goodbye',
'goodbye',
modelName
)
})
it.each(models)('should divide two numbers with model %s', async (modelName) => {
const prompt = 'divide 15 by 3. Return only the number, no explanation.'
const expected = '5'
let model = 'unknown'
let router = 'unknown'
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}) as string[]
const actual = result?.[0]?.trim() || ''
const passed = actual === expected
expect(actual).toEqual(expected)
// Add test result to array
testResults.push({
test: 'division',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
it.each(models)('should respond to "yes" with model %s', async (modelName) => {
await runBasicTest(
'say "yes"',
'yes',
'yes',
modelName
)
})
it('should generate markdown report', () => {
@ -157,36 +150,55 @@ describe('Basic Operations', () => {
})
// Generate markdown report
let report = '# Basic Operations Test Results\n\n'
let report = '# Basic Test Results\n\n'
// First list failed tests
report += '## Failed Tests\n\n'
let hasFailures = false
for (const [testName, modelResults] of latestResults) {
for (const [model, result] of modelResults) {
if (!result.passed) {
hasFailures = true
report += `### ${testName} - ${model}\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms\n`
if (result.error) {
report += `- Error Type: ${result.error.type}\n`
report += `- Error Code: ${result.error.code}\n`
report += `- Error Message: ${result.error.message}\n`
}
report += `- Reason: ${result.reason}\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}
}
if (!hasFailures) {
report += '*No failed tests*\n\n'
}
// Then list passed tests
report += '## Passed Tests\n\n'
let hasPassed = false
for (const [testName, modelResults] of latestResults) {
for (const [model, result] of modelResults) {
if (result.passed) {
hasPassed = true
report += `### ${testName} - ${model}\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}
}
if (!hasPassed) {
report += '*No passed tests*\n\n'
}
// Write report to file
const reportPath = path.resolve(__dirname, './basic-report.md')

View File

@ -1,6 +1,18 @@
import * as path from 'node:path'
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
// Credentials for each router, read from the environment once at module load.
// A variable that is not set yields an empty string instead of undefined.
export const TEST_CONFIG = {
  openrouter: {
    key: process.env.OPENROUTER_API_KEY ?? '',
    org: process.env.OPENROUTER_ORG_ID ?? ''
  },
  openai: {
    key: process.env.OPENAI_API_KEY ?? '',
    org: process.env.OPENAI_ORG_ID ?? ''
  }
}
export const models = [
E_OPENROUTER_MODEL_FREE.MODEL_FREE_DEEPSEEK_DEEPSEEK_CHAT_FREE,
E_OPENROUTER_MODEL_FREE.MODEL_FREE_GOOGLE_GEMINI_2_0_FLASH_EXP_FREE,
@ -10,6 +22,7 @@ export const models = [
// Directory two levels above the one containing this file.
export const TEST_BASE_PATH = path.resolve(__dirname, '../../')
// `logs` directory under that same base directory.
export const TEST_LOGS_PATH = path.resolve(__dirname, '../../logs')
// `preferences.md` file under that same base directory.
export const TEST_PREFERENCES_PATH = path.resolve(__dirname, '../../preferences.md')
// Timeout value in milliseconds.
export const TEST_TIMEOUT = 30000 // 30 seconds timeout for API calls
export interface TestResult {
test: string;
@ -21,4 +34,55 @@ export interface TestResult {
timestamp: string;
passed: boolean;
reason?: string;
error?: {
message: string;
code?: string;
type?: string;
details?: any;
};
duration?: number;
config?: {
apiKey?: string;
baseURL?: string;
router?: string;
};
}
/**
 * Normalizes a caught value into the `error` shape stored on a TestResult.
 *
 * @param error - Whatever was caught: an Error, an object carrying
 *   `response` / `response.data`, a thrown string, or null/undefined.
 * @returns An object with `message`, `code`, `type` and `details`; each field
 *   falls back to a default when the caught value does not provide it.
 */
export const formatError = (error: any): TestResult['error'] => {
  // A thrown string has no `.message`; its text is the value itself, so keep
  // it instead of reporting 'Unknown error'.
  const thrownText = typeof error === 'string' && error !== '' ? error : undefined
  return {
    message: error?.message || thrownText || 'Unknown error',
    code: error?.code || 'UNKNOWN',
    // Explicit `type` wins; otherwise use the constructor name of the value.
    type: error?.type || error?.constructor?.name || 'Error',
    // Prefer the response payload when one is attached, else keep the raw value.
    details: error?.response?.data || error?.response || error
  }
}
/**
 * Tells whether a model response carries no usable text.
 *
 * @param result - The list of response strings, or null/undefined.
 * @returns true when the list is missing, has no entries, or every entry is
 *   empty or whitespace-only; false as soon as one entry has visible text.
 */
export const isEmptyResponse = (result: string[] | null | undefined): boolean => {
  if (!result) {
    return true
  }
  for (const entry of result) {
    // One entry with non-whitespace content is enough to count as a response.
    if (entry && entry.trim() !== '') {
      return false
    }
  }
  return true
}
/**
 * Picks the router name for a model identifier.
 *
 * @param model - Model identifier string.
 * @returns 'openai' when the identifier begins with 'gpt-', otherwise 'openrouter'.
 */
export const getRouterForModel = (model: string): string =>
  model.startsWith('gpt-') ? 'openai' : 'openrouter'
/**
 * Looks up the API key configured in TEST_CONFIG for a router.
 *
 * @param router - Router name; 'openai' and 'openrouter' are recognised.
 * @returns The configured key for that router, or an empty string for any
 *   other router name.
 */
export const getApiKeyForRouter = (router: string): string => {
  if (router === 'openai') {
    return TEST_CONFIG.openai.key
  }
  if (router === 'openrouter') {
    return TEST_CONFIG.openrouter.key
  }
  return ''
}
/**
 * Checks that both router API keys are present in TEST_CONFIG.
 *
 * @throws Error naming every environment variable whose key is empty,
 *   in the order OPENROUTER_API_KEY, OPENAI_API_KEY.
 */
export const validateConfig = () => {
  // Pair each environment variable name with the value loaded for it.
  const requiredKeys: Array<[string, string]> = [
    ['OPENROUTER_API_KEY', TEST_CONFIG.openrouter.key],
    ['OPENAI_API_KEY', TEST_CONFIG.openai.key]
  ]
  const missingKeys = requiredKeys
    .filter(([, value]) => !value)
    .map(([name]) => name)
  if (missingKeys.length === 0) {
    return
  }
  throw new Error(`Missing required environment variables: ${missingKeys.join(', ')}`)
}

View File

@ -2,12 +2,103 @@
## Failed Tests
### german - deepseek/deepseek-chat:free
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
- Expected: `hallo`
- Actual: ``
- Duration: 985ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Model returned empty response
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:05:50 PM
### german - google/gemini-2.0-flash-exp:free
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
- Expected: `hallo`
- Actual: ``
- Duration: 746ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Model returned empty response
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:05:51 PM
### german - gpt-4
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
- Expected: `hallo`
- Actual: ``
- Duration: 1067ms
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:05:52 PM
### spanish - deepseek/deepseek-chat:free
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
- Expected: `sí`
- Actual: ``
- Duration: 678ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Model returned empty response
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:05:53 PM
### spanish - google/gemini-2.0-flash-exp:free
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
- Expected: `sí`
- Actual: ``
- Duration: 744ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Model returned empty response
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:05:53 PM
### spanish - gpt-4
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
- Expected: `sí`
- Actual: ``
- Duration: 1125ms
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:05:55 PM
### french - deepseek/deepseek-chat:free
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
- Expected: `non`
- Actual: ``
- Duration: 626ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Model returned empty response
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:05:55 PM
### french - gpt-4
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
- Expected: `non`
- Actual: ``
- Duration: 1341ms
- Reason: Unknown error occurred
- Timestamp: 4/1/2025, 1:05:57 PM
### french - google/gemini-2.0-flash-exp:free
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
- Expected: `non`
- Actual: ``
- Duration: 729ms
- Error Type: Error
- Error Code: UNKNOWN
- Error Message: Model returned empty response
- Reason: Model returned empty response
- Timestamp: 4/1/2025, 1:05:56 PM
## Passed Tests
### german_translation - deepseek/deepseek-chat:free
- Prompt: `translate "hello" to German. Return only the translation, no explanation.`
- Expected: `hallo`
- Actual: `Hallo`
- Duration: undefinedms
- Timestamp: 4/1/2025, 12:56:01 PM
### german_translation - google/gemini-2.0-flash-exp:free
@ -15,18 +106,21 @@
- Expected: `hallo`
- Actual: `Hallo
`
- Duration: undefinedms
- Timestamp: 4/1/2025, 12:56:02 PM
### german_translation - gpt-4
- Prompt: `translate "hello" to German. Return only the translation, no explanation.`
- Expected: `hallo`
- Actual: `Hallo`
- Duration: undefinedms
- Timestamp: 4/1/2025, 12:56:32 PM
### spanish_translation - deepseek/deepseek-chat:free
- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
- Expected: `sí`
- Actual: `sí`
- Duration: undefinedms
- Timestamp: 4/1/2025, 12:56:05 PM
### spanish_translation - google/gemini-2.0-flash-exp:free
@ -34,18 +128,21 @@
- Expected: `sí`
- Actual: `sí
`
- Duration: undefinedms
- Timestamp: 4/1/2025, 12:56:06 PM
### spanish_translation - gpt-4
- Prompt: `translate "yes" to Spanish. Return only the translation, no explanation.`
- Expected: `sí`
- Actual: `sí`
- Duration: undefinedms
- Timestamp: 4/1/2025, 12:56:35 PM
### french_translation - deepseek/deepseek-chat:free
- Prompt: `translate "no" to French. Return only the translation, no explanation.`
- Expected: `non`
- Actual: `non`
- Duration: undefinedms
- Timestamp: 4/1/2025, 12:56:08 PM
### french_translation - google/gemini-2.0-flash-exp:free
@ -53,61 +150,13 @@
- Expected: `non`
- Actual: `non
`
- Duration: undefinedms
- Timestamp: 4/1/2025, 12:56:10 PM
### french_translation - gpt-4
- Prompt: `translate "no" to French. Return only the translation, no explanation.`
- Expected: `non`
- Actual: `non`
- Duration: undefinedms
- Timestamp: 4/1/2025, 12:56:37 PM
### german - deepseek/deepseek-chat:free
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
- Expected: `hallo`
- Actual: `Hallo`
- Timestamp: 4/1/2025, 12:59:07 PM
### german - google/gemini-2.0-flash-exp:free
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
- Expected: `hallo`
- Actual: `Hallo
`
- Timestamp: 4/1/2025, 12:59:09 PM
### german - gpt-4
- Prompt: `translate "hello" to German. Return only the translated word, no explanation.`
- Expected: `hallo`
- Actual: `Hallo`
- Timestamp: 4/1/2025, 12:59:11 PM
### spanish - deepseek/deepseek-chat:free
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
- Expected: `sí`
- Actual: `Sí`
- Timestamp: 4/1/2025, 12:59:12 PM
### spanish - google/gemini-2.0-flash-exp:free
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
- Expected: `sí`
- Actual: `Sí
`
- Timestamp: 4/1/2025, 12:59:14 PM
### spanish - gpt-4
- Prompt: `translate "yes" to Spanish. Return only the translated word, no explanation.`
- Expected: `sí`
- Actual: `sí`
- Timestamp: 4/1/2025, 12:59:15 PM
### french - deepseek/deepseek-chat:free
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
- Expected: `non`
- Actual: `non`
- Timestamp: 4/1/2025, 12:59:18 PM
### french - gpt-4
- Prompt: `translate "no" to French. Return only the translated word, no explanation.`
- Expected: `non`
- Actual: `non`
- Timestamp: 4/1/2025, 12:59:20 PM

View File

@ -526,5 +526,179 @@
"router": "openrouter",
"timestamp": "2025-04-01T10:59:20.589Z",
"passed": true
},
{
"test": "german",
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
"result": [
"Hallo"
],
"expected": "hallo",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:03:09.890Z",
"passed": true
},
{
"test": "spanish",
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
"result": [
"sí"
],
"expected": "sí",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:03:12.312Z",
"passed": true
},
{
"test": "french",
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
"result": [
"non"
],
"expected": "non",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:03:14.660Z",
"passed": true
},
{
"test": "german",
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
"result": [],
"expected": "hallo",
"model": "deepseek/deepseek-chat:free",
"router": "openrouter",
"timestamp": "2025-04-01T11:05:50.723Z",
"passed": false,
"duration": 985,
"error": {
"message": "Model returned empty response",
"code": "UNKNOWN",
"type": "Error"
},
"reason": "Model returned empty response"
},
{
"test": "german",
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
"result": [],
"expected": "hallo",
"model": "google/gemini-2.0-flash-exp:free",
"router": "openrouter",
"timestamp": "2025-04-01T11:05:51.471Z",
"passed": false,
"duration": 746,
"error": {
"message": "Model returned empty response",
"code": "UNKNOWN",
"type": "Error"
},
"reason": "Model returned empty response"
},
{
"test": "german",
"prompt": "translate \"hello\" to German. Return only the translated word, no explanation.",
"result": [],
"expected": "hallo",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:05:52.540Z",
"passed": false,
"duration": 1067,
"reason": "Unknown error occurred"
},
{
"test": "spanish",
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
"result": [],
"expected": "sí",
"model": "deepseek/deepseek-chat:free",
"router": "openrouter",
"timestamp": "2025-04-01T11:05:53.219Z",
"passed": false,
"duration": 678,
"error": {
"message": "Model returned empty response",
"code": "UNKNOWN",
"type": "Error"
},
"reason": "Model returned empty response"
},
{
"test": "spanish",
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
"result": [],
"expected": "sí",
"model": "google/gemini-2.0-flash-exp:free",
"router": "openrouter",
"timestamp": "2025-04-01T11:05:53.964Z",
"passed": false,
"duration": 744,
"error": {
"message": "Model returned empty response",
"code": "UNKNOWN",
"type": "Error"
},
"reason": "Model returned empty response"
},
{
"test": "spanish",
"prompt": "translate \"yes\" to Spanish. Return only the translated word, no explanation.",
"result": [],
"expected": "sí",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:05:55.090Z",
"passed": false,
"duration": 1125,
"reason": "Unknown error occurred"
},
{
"test": "french",
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
"result": [],
"expected": "non",
"model": "deepseek/deepseek-chat:free",
"router": "openrouter",
"timestamp": "2025-04-01T11:05:55.717Z",
"passed": false,
"duration": 626,
"error": {
"message": "Model returned empty response",
"code": "UNKNOWN",
"type": "Error"
},
"reason": "Model returned empty response"
},
{
"test": "french",
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
"result": [],
"expected": "non",
"model": "google/gemini-2.0-flash-exp:free",
"router": "openrouter",
"timestamp": "2025-04-01T11:05:56.447Z",
"passed": false,
"duration": 729,
"error": {
"message": "Model returned empty response",
"code": "UNKNOWN",
"type": "Error"
},
"reason": "Model returned empty response"
},
{
"test": "french",
"prompt": "translate \"no\" to French. Return only the translated word, no explanation.",
"result": [],
"expected": "non",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:05:57.790Z",
"passed": false,
"duration": 1341,
"reason": "Unknown error occurred"
}
]

View File

@ -4,7 +4,18 @@ import * as path from 'node:path'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
import {
models,
TEST_BASE_PATH,
TEST_LOGS_PATH,
TEST_PREFERENCES_PATH,
TEST_TIMEOUT,
TestResult,
formatError,
isEmptyResponse,
getRouterForModel,
getApiKeyForRouter
} from './commons'
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
@ -19,127 +30,109 @@ describe('Language Capabilities', () => {
testResults = Array.isArray(data) ? data : []
}
it.each(models)('should translate "hello" to German with model %s', async (modelName) => {
const prompt = 'translate "hello" to German. Return only the translated word, no explanation.'
const expected = 'hallo'
const runTranslationTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
let model = 'unknown'
let router = 'unknown'
let startTime = Date.now()
let error: TestResult['error'] | undefined
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
try {
const result = await Promise.race([
run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
)
]) as string[]
const actual = result?.[0]?.trim()?.toLowerCase() || ''
const passed = actual === expected && !isEmptyResponse(result)
if (isEmptyResponse(result)) {
throw new Error('Model returned empty response')
}
}) as string[]
expect(actual).toEqual(expected)
const actual = result?.[0]?.trim()?.toLowerCase() || ''
const passed = actual === expected
expect(actual).toEqual(expected)
return {
test: testName,
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
duration: Date.now() - startTime,
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
config: {
router: getRouterForModel(modelName),
apiKey: getApiKeyForRouter(getRouterForModel(modelName))
}
}
} catch (e) {
error = formatError(e)
throw e
} finally {
const testResult: TestResult = {
test: testName,
prompt,
result: [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed: false,
duration: Date.now() - startTime,
error,
reason: error?.message || 'Unknown error occurred',
config: {
router: getRouterForModel(modelName),
apiKey: getApiKeyForRouter(getRouterForModel(modelName))
}
}
testResults.push(testResult)
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
}
}
// Add test result to array
testResults.push({
test: 'german',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
it.each(models)('should translate "hello" to German with model %s', async (modelName) => {
await runTranslationTest(
'translate "hello" to German. Return only the translated word, no explanation.',
'hallo',
'german',
modelName
)
})
it.each(models)('should translate "yes" to Spanish with model %s', async (modelName) => {
const prompt = 'translate "yes" to Spanish. Return only the translated word, no explanation.'
const expected = 'sí'
let model = 'unknown'
let router = 'unknown'
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}) as string[]
const actual = result?.[0]?.trim()?.toLowerCase() || ''
const passed = actual === expected
expect(actual).toEqual(expected)
// Add test result to array
testResults.push({
test: 'spanish',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
await runTranslationTest(
'translate "yes" to Spanish. Return only the translated word, no explanation.',
'sí',
'spanish',
modelName
)
})
it.each(models)('should translate "no" to French with model %s', async (modelName) => {
const prompt = 'translate "no" to French. Return only the translated word, no explanation.'
const expected = 'non'
let model = 'unknown'
let router = 'unknown'
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}) as string[]
const actual = result?.[0]?.trim()?.toLowerCase() || ''
const passed = actual === expected
expect(actual).toEqual(expected)
// Add test result to array
testResults.push({
test: 'french',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
await runTranslationTest(
'translate "no" to French. Return only the translated word, no explanation.',
'non',
'french',
modelName
)
})
it('should generate markdown report', () => {
@ -163,32 +156,51 @@ describe('Language Capabilities', () => {
// First list failed tests
report += '## Failed Tests\n\n'
let hasFailures = false
for (const [testName, modelResults] of latestResults) {
for (const [model, result] of modelResults) {
if (!result.passed) {
hasFailures = true
report += `### ${testName} - ${model}\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms\n`
if (result.error) {
report += `- Error Type: ${result.error.type}\n`
report += `- Error Code: ${result.error.code}\n`
report += `- Error Message: ${result.error.message}\n`
}
report += `- Reason: ${result.reason}\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}
}
if (!hasFailures) {
report += '*No failed tests*\n\n'
}
// Then list passed tests
report += '## Passed Tests\n\n'
let hasPassed = false
for (const [testName, modelResults] of latestResults) {
for (const [model, result] of modelResults) {
if (result.passed) {
hasPassed = true
report += `### ${testName} - ${model}\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}
}
if (!hasPassed) {
report += '*No passed tests*\n\n'
}
// Write report to file
const reportPath = path.resolve(__dirname, './language-report.md')

View File

@ -21,7 +21,7 @@
- Prompt: `calculate the factorial of 5 (5!). Return only the number, no explanation.`
- Expected: `120`
- Actual: `120`
- Timestamp: 4/1/2025, 12:59:11 PM
- Timestamp: 4/1/2025, 1:03:25 PM
### fibonacci - deepseek/deepseek-chat:free
- Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
@ -40,7 +40,7 @@
- Prompt: `calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.`
- Expected: `0,1,1,2,3`
- Actual: `0, 1, 1, 2, 3`
- Timestamp: 4/1/2025, 12:59:16 PM
- Timestamp: 4/1/2025, 1:03:27 PM
### quadratic - deepseek/deepseek-chat:free
- Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
@ -57,6 +57,6 @@
### quadratic - gpt-4
- Prompt: `solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.`
- Expected: `[-3,-2]`
- Actual: `["-2", "-3"]`
- Timestamp: 4/1/2025, 12:59:20 PM
- Actual: `[-2, -3]`
- Timestamp: 4/1/2025, 1:03:30 PM

View File

@ -984,5 +984,41 @@
"router": "openrouter",
"timestamp": "2025-04-01T10:59:20.963Z",
"passed": true
},
{
"test": "factorial",
"prompt": "calculate the factorial of 5 (5!). Return only the number, no explanation.",
"result": [
"120"
],
"expected": "120",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:03:25.009Z",
"passed": true
},
{
"test": "fibonacci",
"prompt": "calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.",
"result": [
"0, 1, 1, 2, 3"
],
"expected": "0,1,1,2,3",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:03:27.826Z",
"passed": true
},
{
"test": "quadratic",
"prompt": "solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.",
"result": [
"[-2, -3]"
],
"expected": "[-3,-2]",
"model": "gpt-4",
"router": "openrouter",
"timestamp": "2025-04-01T11:03:30.579Z",
"passed": true
}
]

View File

@ -4,13 +4,24 @@ import * as path from 'node:path'
import { sync as write } from "@polymech/fs/write"
import { sync as read } from "@polymech/fs/read"
import { sync as exists } from "@polymech/fs/exists"
import { models, TEST_BASE_PATH, TEST_LOGS_PATH, TEST_PREFERENCES_PATH, TestResult } from './commons'
import {
models,
TEST_BASE_PATH,
TEST_LOGS_PATH,
TEST_PREFERENCES_PATH,
TEST_TIMEOUT,
TestResult,
formatError,
isEmptyResponse,
getRouterForModel,
getApiKeyForRouter
} from './commons'
import { E_OPENROUTER_MODEL_FREE, E_OPENAI_MODEL } from '../../src/index'
const TEST_LOG_PATH = path.resolve(__dirname, './math.json')
describe('Advanced Math Operations', () => {
describe('Math Capabilities', () => {
let testResults: TestResult[] = []
// Load existing results if any
@ -19,150 +30,109 @@ describe('Advanced Math Operations', () => {
testResults = Array.isArray(data) ? data : []
}
it.each(models)('should calculate factorial of 5 with model %s', async (modelName) => {
const prompt = 'calculate the factorial of 5 (5!). Return only the number, no explanation.'
const expected = '120'
const runMathTest = async (prompt: string, expected: string, testName: string, modelName: string) => {
let model = 'unknown'
let router = 'unknown'
let startTime = Date.now()
let error: TestResult['error'] | undefined
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}) as string[]
const actual = result?.[0]?.trim() || ''
const passed = actual === expected
expect(actual).toEqual(expected)
// Add test result to array
testResults.push({
test: 'factorial',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
})
it.each(models)('should calculate fibonacci sequence up to 5th number with model %s', async (modelName) => {
const prompt = 'calculate the first 5 numbers of the fibonacci sequence. Return only the numbers separated by commas, no explanation.'
const expected = '0,1,1,2,3'
let model = 'unknown'
let router = 'unknown'
const result = await run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}) as string[]
// Handle both formats: "0,1,1,2,3" and "0, 1, 1, 2, 3"
const numbers = result?.[0]?.trim()?.split(',')?.map(n => n.trim()) || []
const actual = numbers.join(',')
const passed = actual === expected
expect(numbers).toEqual(['0', '1', '1', '2', '3'])
// Add test result to array
testResults.push({
test: 'fibonacci',
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: !actual ? 'No result returned from model' : passed ? undefined : `Expected ${expected}, but got ${actual}`
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
})
it.each(models)('should solve quadratic equation x² + 5x + 6 = 0 with model %s', async (modelName) => {
const prompt = 'solve the quadratic equation x² + 5x + 6 = 0. Return only the roots as a JSON array, no explanation.'
const expectedDisplay = '[-3,-2]'
let model = 'unknown'
let router = 'unknown'
const result = await run({
prompt,
mode: 'completion',
model: modelName,
filters: 'code',
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}) as string[]
// Parse the result as JSON (markdown already stripped by filter)
let jsonResult: number[]
try {
const resultStr = result?.[0]?.trim() || '[]'
if (!resultStr) {
throw new Error('No result returned from model')
const result = await Promise.race([
run({
prompt,
mode: 'completion',
model: modelName,
path: TEST_BASE_PATH,
logs: TEST_LOGS_PATH,
preferences: TEST_PREFERENCES_PATH,
onRun: async (options) => {
model = options.model || 'unknown'
router = options.router || 'unknown'
return options
}
}),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('API call timed out')), TEST_TIMEOUT)
)
]) as string[]
const actual = result?.[0]?.trim()?.toLowerCase() || ''
const passed = actual === expected && !isEmptyResponse(result)
if (isEmptyResponse(result)) {
throw new Error('Model returned empty response')
}
jsonResult = JSON.parse(resultStr)
if (!Array.isArray(jsonResult)) {
throw new Error('Result is not an array')
expect(actual).toEqual(expected)
return {
test: testName,
prompt,
result: result || [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed,
duration: Date.now() - startTime,
reason: passed ? undefined : `Expected ${expected}, but got ${actual}`,
config: {
router: getRouterForModel(modelName),
apiKey: getApiKeyForRouter(getRouterForModel(modelName))
}
}
// Convert any string numbers to actual numbers
jsonResult = jsonResult.map(n => typeof n === 'string' ? parseFloat(n) : n)
} catch (error) {
// If parsing fails, try to extract numbers from the string
const numbers = result?.[0]?.match(/-?\d+/g)?.map(n => parseInt(n, 10)) || []
jsonResult = numbers
} catch (e) {
error = formatError(e)
throw e
} finally {
const testResult: TestResult = {
test: testName,
prompt,
result: [],
expected,
model,
router,
timestamp: new Date().toISOString(),
passed: false,
duration: Date.now() - startTime,
error,
reason: error?.message || 'Unknown error occurred',
config: {
router: getRouterForModel(modelName),
apiKey: getApiKeyForRouter(getRouterForModel(modelName))
}
}
testResults.push(testResult)
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
}
}
const actual = JSON.stringify(jsonResult.sort())
const expectedSorted = JSON.stringify([-3, -2].sort())
const passed = actual === expectedSorted
expect(jsonResult.sort()).toEqual([-3, -2].sort())
// Addition case: runs once per entry in `models`, delegating the model call,
// the assertion and the result logging to the shared `runMathTest` helper.
it.each(models)('should add two numbers with model %s', async (modelName) => {
  const additionPrompt = 'add 5 and 3. Return only the number, no explanation.'
  const additionAnswer = '8'
  await runMathTest(additionPrompt, additionAnswer, 'addition', modelName)
})
// Add test result to array
testResults.push({
test: 'quadratic',
prompt,
result: result || [],
expected: expectedDisplay,
model,
router,
timestamp: new Date().toISOString(),
passed,
reason: !result?.[0] ? 'No result returned from model' : passed ? undefined : `Expected ${expectedDisplay}, but got ${result?.[0] || ''}`
})
// Multiplication case: runs once per entry in `models`, delegating the model
// call, the assertion and the result logging to the shared `runMathTest` helper.
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
  const multiplicationPrompt = 'multiply 8 and 3. Return only the number, no explanation.'
  const multiplicationAnswer = '24'
  await runMathTest(multiplicationPrompt, multiplicationAnswer, 'multiplication', modelName)
})
// Write all results to the same file
write(TEST_LOG_PATH, JSON.stringify(testResults, null, 2))
// Division case: runs once per entry in `models`, delegating the model call,
// the assertion and the result logging to the shared `runMathTest` helper.
it.each(models)('should divide two numbers with model %s', async (modelName) => {
  const divisionPrompt = 'divide 15 by 3. Return only the number, no explanation.'
  const divisionAnswer = '5'
  await runMathTest(divisionPrompt, divisionAnswer, 'division', modelName)
})
it('should generate markdown report', () => {
@ -186,32 +156,51 @@ describe('Advanced Math Operations', () => {
// First list failed tests
report += '## Failed Tests\n\n'
let hasFailures = false
for (const [testName, modelResults] of latestResults) {
for (const [model, result] of modelResults) {
if (!result.passed) {
hasFailures = true
report += `### ${testName} - ${model}\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms\n`
if (result.error) {
report += `- Error Type: ${result.error.type}\n`
report += `- Error Code: ${result.error.code}\n`
report += `- Error Message: ${result.error.message}\n`
}
report += `- Reason: ${result.reason}\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}
}
if (!hasFailures) {
report += '*No failed tests*\n\n'
}
// Then list passed tests
report += '## Passed Tests\n\n'
let hasPassed = false
for (const [testName, modelResults] of latestResults) {
for (const [model, result] of modelResults) {
if (result.passed) {
hasPassed = true
report += `### ${testName} - ${model}\n`
report += `- Prompt: \`${result.prompt}\`\n`
report += `- Expected: \`${result.expected}\`\n`
report += `- Actual: \`${result.result[0] || ''}\`\n`
report += `- Duration: ${result.duration}ms\n`
report += `- Timestamp: ${new Date(result.timestamp).toLocaleString()}\n\n`
}
}
}
if (!hasPassed) {
report += '*No passed tests*\n\n'
}
// Write report to file
const reportPath = path.resolve(__dirname, './math-report.md')