grid search

This commit is contained in:
lovebird 2026-03-19 17:44:22 +01:00
parent 561ec84eef
commit 2c69a898dc

View File

@ -2,7 +2,7 @@ import { describe, it, expect } from 'vitest'
import { sync as exists } from "@polymech/fs/exists"
import { z } from 'zod'
import {
import {
TEST_TIMEOUT,
TestResult,
runTest,
@ -71,10 +71,10 @@ describe('Ollama Basic Operations', () => {
const TEST_LOG_PATH = getReportPaths('ollama-basics', 'json')
const TEST_REPORT_PATH = getReportPaths('ollama-basics', 'md')
it.each(models)('should add two numbers with model %s', async (modelName) => {
it.each(models)('should add two numbers with model %s', { timeout: TEST_TIMEOUT }, async (modelName) => {
const result = await runTest(
'add 5 and 3. Return only the number, no explanation.',
'8',
'8',
'addition',
modelName,
TEST_LOG_PATH,
@ -83,9 +83,9 @@ describe('Ollama Basic Operations', () => {
)
testResults.push(result)
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('8')
}, { timeout: TEST_TIMEOUT })
})
it.each(models)('should multiply two numbers with model %s', async (modelName) => {
it.each(models)('should multiply two numbers with model %s', { timeout: TEST_TIMEOUT }, async (modelName) => {
const result = await runTest(
'multiply 8 and 3. Return only the number, no explanation.',
'24',
@ -97,9 +97,9 @@ describe('Ollama Basic Operations', () => {
)
testResults.push(result)
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('24')
}, { timeout: TEST_TIMEOUT })
})
it.each(models)('should divide two numbers with model %s', async (modelName) => {
it.each(models)('should divide two numbers with model %s', { timeout: TEST_TIMEOUT }, async (modelName) => {
const result = await runTest(
'divide 15 by 3. Return only the number, no explanation.',
'5',
@ -111,7 +111,7 @@ describe('Ollama Basic Operations', () => {
)
testResults.push(result)
expect(result.result[0]?.trim()?.toLowerCase()).toEqual('5')
}, { timeout: TEST_TIMEOUT })
})
it('should generate markdown report', () => {
generateTestReport(testResults, 'Ollama Basic Operations Test Results', TEST_REPORT_PATH)
@ -130,6 +130,7 @@ describe('Ollama Custom Tool Call Quality', () => {
it.each(models)(
'should call add tool and return correct sum [%s]',
{ timeout: TEST_TIMEOUT },
async (modelName) => {
const result = await runTest(
'Use the add tool to add 17 and 25. Report back the result.',
@ -145,14 +146,13 @@ describe('Ollama Custom Tool Call Quality', () => {
}
)
testResults.push(result)
// Result must contain 42
expect(result.result[0]).toMatch(/42/)
},
{ timeout: TEST_TIMEOUT }
}
)
it.each(models)(
'should call multiply tool and return correct product [%s]',
{ timeout: TEST_TIMEOUT },
async (modelName) => {
const result = await runTest(
'Use the multiply tool to compute 6 times 7. Tell me the answer.',
@ -169,12 +169,12 @@ describe('Ollama Custom Tool Call Quality', () => {
)
testResults.push(result)
expect(result.result[0]).toMatch(/42/)
},
{ timeout: TEST_TIMEOUT }
}
)
it.each(models)(
'should call get_weather tool with correct city argument [%s]',
{ timeout: TEST_TIMEOUT },
async (modelName) => {
const result = await runTest(
"What's the weather like in Paris? Use the get_weather tool.",
@ -190,15 +190,14 @@ describe('Ollama Custom Tool Call Quality', () => {
}
)
testResults.push(result)
// Response must mention the mocked condition "sunny" and/or 22°C
const lower = result.result[0]?.toLowerCase() ?? ''
expect(lower).toMatch(/sunny|22/)
},
{ timeout: TEST_TIMEOUT }
}
)
it.each(models)(
'should select the correct tool from multiple available tools [%s]',
{ timeout: TEST_TIMEOUT },
async (modelName) => {
const result = await runTest(
'Use the appropriate tool to add 100 and 200.',
@ -209,19 +208,18 @@ describe('Ollama Custom Tool Call Quality', () => {
'tools',
{
router: 'ollama',
// Both tools available — model must pick add, not multiply
customTools: [addTool, multiplyTool, getWeatherTool],
equalityCheck: 'llm_equal',
}
)
testResults.push(result)
expect(result.result[0]).toMatch(/300/)
},
{ timeout: TEST_TIMEOUT }
}
)
it.each(models)(
'should chain two tool calls: multiply then format [%s]',
{ timeout: TEST_TIMEOUT },
async (modelName) => {
const result = await runTest(
'First multiply 123 by 456, then format the result with 2 decimal places.',
@ -239,8 +237,7 @@ describe('Ollama Custom Tool Call Quality', () => {
testResults.push(result)
// 123 * 456 = 56088 → formatted as 56,088.00
expect(result.result[0]).toMatch(/56[,.]?088/)
},
{ timeout: TEST_TIMEOUT }
}
)
it('should generate tool quality markdown report', () => {
@ -248,4 +245,3 @@ describe('Ollama Custom Tool Call Quality', () => {
expect(exists(TEST_REPORT_PATH) === 'file').toBe(true)
})
})