kbot llm eq cmp :):
This commit is contained in:
parent
1535cb754a
commit
c5eadead31
@ -1,5 +1,5 @@
|
||||
{
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
|
||||
@ -24,7 +24,7 @@ export enum EqualityCheck {
|
||||
NONE = 'none'
|
||||
}
|
||||
|
||||
export type EqualityFn = (actual: string, expected: string) => Promise<boolean>;
|
||||
export type EqualityFn = (actual: string, expected: string, prompt?: string) => Promise<boolean>;
|
||||
|
||||
export const EQUALITY_CHECKS: Record<string, EqualityFn> = {
|
||||
[EqualityCheck.DEFAULT]: async (actual: string, expected: string): Promise<boolean> => {
|
||||
@ -40,6 +40,55 @@ export const EQUALITY_CHECKS: Record<string, EqualityFn> = {
|
||||
return false;
|
||||
}
|
||||
},
|
||||
[EqualityCheck.LLM_EQUAL]: async (actual: string, expected: string, prompt?: string): Promise<boolean> => {
|
||||
try {
|
||||
const model = getTestEqualModels()[0];
|
||||
if (!model) {
|
||||
return false;
|
||||
}
|
||||
const comparisonPrompt = `You are an assistant that judges if two AI responses are semantically equivalent for a given prompt.
|
||||
The original prompt was: "${prompt}"
|
||||
|
||||
Response A:
|
||||
"${actual}"
|
||||
Response B (expected):
|
||||
"${expected}"
|
||||
Are these two responses semantically equivalent? Consider that minor differences in formatting (like commas, casing) or phrasing should be ignored as long as the meaning is the same. Dont comment, just return the JSON object.`;
|
||||
const responseSchema = {
|
||||
type: 'object',
|
||||
properties: {
|
||||
equivalent: {
|
||||
type: 'boolean',
|
||||
description: "Whether the two responses are semantically equivalent."
|
||||
},
|
||||
semantic_distance: {
|
||||
type: 'number',
|
||||
description: "The semantic distance between the two responses, on a scale of 0 to 100."
|
||||
}
|
||||
},
|
||||
required: ['equivalent', 'semantic_distance']
|
||||
};
|
||||
|
||||
const zodSchema = JSONSchemaToZod.convert(responseSchema);
|
||||
const format = zodResponseFormat(zodSchema, 'equivalent');
|
||||
|
||||
const result = await run({
|
||||
prompt: comparisonPrompt,
|
||||
model,
|
||||
mode: 'completion',
|
||||
format
|
||||
});
|
||||
|
||||
if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const parsedResult = JSON.parse(result[0] as string);
|
||||
return parsedResult.equivalent === true;
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
},
|
||||
[EqualityCheck.NONE]: async (): Promise<boolean> => {
|
||||
return true;
|
||||
},
|
||||
@ -55,7 +104,7 @@ export const getFastModels = (): string[] => {
|
||||
|
||||
export const getTestEqualModels = (): string[] => {
|
||||
return [
|
||||
E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE
|
||||
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
|
||||
]
|
||||
}
|
||||
|
||||
@ -248,7 +297,7 @@ export const runTest = async (
|
||||
)
|
||||
]) as string[]
|
||||
|
||||
if (isEmptyResponse(result)) {
|
||||
if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) {
|
||||
testResult = {
|
||||
test: testName,
|
||||
prompt,
|
||||
@ -265,7 +314,7 @@ export const runTest = async (
|
||||
const actual = result?.[0] || ''
|
||||
const equalityCheck = options.equalityCheck || EqualityCheck.DEFAULT
|
||||
const checkFn = EQUALITY_CHECKS[equalityCheck] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]
|
||||
const passed = await checkFn(actual, expected)
|
||||
const passed = await checkFn(actual, expected, prompt)
|
||||
|
||||
testResult = {
|
||||
test: testName,
|
||||
|
||||
@ -10,7 +10,8 @@ import {
|
||||
TestResult,
|
||||
runTest,
|
||||
generateTestReport,
|
||||
getReportPaths
|
||||
getReportPaths,
|
||||
EqualityCheck
|
||||
} from './commons'
|
||||
|
||||
// Optionally override models for this specific test file
|
||||
@ -27,6 +28,22 @@ describe('Language Operations', () => {
|
||||
const TEST_LOG_PATH = getReportPaths('language', 'json')
|
||||
const TEST_REPORT_PATH = getReportPaths('language', 'md')
|
||||
|
||||
|
||||
it.each(models)('should summarize text with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.',
|
||||
'A fox jumps over a dog',
|
||||
'summarization',
|
||||
modelName,
|
||||
TEST_LOG_PATH,
|
||||
'completion',
|
||||
{ equalityCheck: EqualityCheck.LLM_EQUAL }
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(result.passed).toBe(true)
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
|
||||
it.each(models)('should translate text with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Translate "Hello, world!" to Spanish. Return only the translation, no explanation.',
|
||||
@ -51,18 +68,6 @@ describe('Language Operations', () => {
|
||||
expect(normalize(result.result[0])).toEqual(normalize('i went to the store yesterday'))
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should summarize text with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.',
|
||||
'A fox jumps over a dog',
|
||||
'summarization',
|
||||
modelName,
|
||||
TEST_LOG_PATH
|
||||
)
|
||||
testResults.push(result)
|
||||
expect(normalize(result.result[0])).toEqual(normalize('a fox jumps over a dog'))
|
||||
}, { timeout: TEST_TIMEOUT })
|
||||
|
||||
it.each(models)('should identify language with model %s', async (modelName) => {
|
||||
const result = await runTest(
|
||||
'Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.',
|
||||
|
||||
@ -4757,6 +4757,330 @@
|
||||
"passed": true,
|
||||
"duration": 3836,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:41:10.838Z",
|
||||
"passed": true,
|
||||
"duration": 1465,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:41:11.649Z",
|
||||
"passed": true,
|
||||
"duration": 805,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:41:16.717Z",
|
||||
"passed": true,
|
||||
"duration": 5063,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"I went to the store yesterday."
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:41:17.368Z",
|
||||
"passed": false,
|
||||
"duration": 646,
|
||||
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"\"I went to the store yesterday.\""
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:41:18.259Z",
|
||||
"passed": false,
|
||||
"duration": 886,
|
||||
"reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"I went to the store yesterday."
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:41:23.006Z",
|
||||
"passed": false,
|
||||
"duration": 4742,
|
||||
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A brown fox leaps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:41:32.126Z",
|
||||
"passed": false,
|
||||
"duration": 9115,
|
||||
"reason": "Expected A fox jumps over a dog, but got A brown fox leaps over a dog.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A brown fox jumps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:44:29.781Z",
|
||||
"passed": false,
|
||||
"duration": 6689,
|
||||
"reason": "Expected A fox jumps over a dog, but got A brown fox jumps over a dog.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A fox jumps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:45:17.372Z",
|
||||
"passed": true,
|
||||
"duration": 47581,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A fox jumps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:52:22.430Z",
|
||||
"passed": true,
|
||||
"duration": 27328,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A fox leaps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:52:30.984Z",
|
||||
"passed": true,
|
||||
"duration": 8548,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:53:31.003Z",
|
||||
"passed": false,
|
||||
"duration": 60014,
|
||||
"error": {
|
||||
"message": "API call timed out",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error",
|
||||
"details": {
|
||||
"stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:296:33)\n at listOnTimeout (node:internal/timers:588:17)\n at processTimers (node:internal/timers:523:7)",
|
||||
"message": "API call timed out"
|
||||
}
|
||||
},
|
||||
"reason": "API call timed out",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:53:32.418Z",
|
||||
"passed": true,
|
||||
"duration": 1408,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:53:33.063Z",
|
||||
"passed": true,
|
||||
"duration": 639,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:53:35.309Z",
|
||||
"passed": true,
|
||||
"duration": 2241,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"I went to the store yesterday."
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:53:35.959Z",
|
||||
"passed": false,
|
||||
"duration": 645,
|
||||
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"\"I went to the store yesterday.\""
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:53:36.606Z",
|
||||
"passed": false,
|
||||
"duration": 641,
|
||||
"reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"I went to the store yesterday."
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:53:41.701Z",
|
||||
"passed": false,
|
||||
"duration": 5090,
|
||||
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "language_detection",
|
||||
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
||||
"result": [
|
||||
"French"
|
||||
],
|
||||
"expected": "French",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:53:42.454Z",
|
||||
"passed": true,
|
||||
"duration": 747,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "language_detection",
|
||||
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
||||
"result": [
|
||||
"French"
|
||||
],
|
||||
"expected": "French",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:53:43.116Z",
|
||||
"passed": true,
|
||||
"duration": 657,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "language_detection",
|
||||
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
||||
"result": [
|
||||
"French"
|
||||
],
|
||||
"expected": "French",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:53:47.420Z",
|
||||
"passed": true,
|
||||
"duration": 4299,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "synonyms",
|
||||
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
|
||||
"result": [
|
||||
"Joyful"
|
||||
],
|
||||
"expected": "joyful",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:53:48.762Z",
|
||||
"passed": true,
|
||||
"duration": 1336,
|
||||
"category": "language"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
@ -4898,15 +5222,15 @@
|
||||
{
|
||||
"test": "translation",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 639,
|
||||
"duration_secs": 0.639
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 818,
|
||||
"duration_secs": 0.818
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1451,
|
||||
"duration_secs": 1.451
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -4920,8 +5244,8 @@
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 695,
|
||||
"duration_secs": 0.695
|
||||
"duration": 641,
|
||||
"duration_secs": 0.641
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -4935,8 +5259,8 @@
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 692,
|
||||
"duration_secs": 0.692
|
||||
"duration": 8548,
|
||||
"duration_secs": 8.548
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -4945,8 +5269,8 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 459,
|
||||
"duration_secs": 0.459
|
||||
"duration": 657,
|
||||
"duration_secs": 0.657
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
@ -5171,5 +5495,5 @@
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-06-05T22:31:00.927Z"
|
||||
"lastUpdated": "2025-06-05T22:53:48.765Z"
|
||||
}
|
||||
@ -1667,21 +1667,345 @@
|
||||
"passed": true,
|
||||
"duration": 3836,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:41:10.838Z",
|
||||
"passed": true,
|
||||
"duration": 1465,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:41:11.649Z",
|
||||
"passed": true,
|
||||
"duration": 805,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:41:16.717Z",
|
||||
"passed": true,
|
||||
"duration": 5063,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"I went to the store yesterday."
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:41:17.368Z",
|
||||
"passed": false,
|
||||
"duration": 646,
|
||||
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"\"I went to the store yesterday.\""
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:41:18.259Z",
|
||||
"passed": false,
|
||||
"duration": 886,
|
||||
"reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"I went to the store yesterday."
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:41:23.006Z",
|
||||
"passed": false,
|
||||
"duration": 4742,
|
||||
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A brown fox leaps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:41:32.126Z",
|
||||
"passed": false,
|
||||
"duration": 9115,
|
||||
"reason": "Expected A fox jumps over a dog, but got A brown fox leaps over a dog.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A brown fox jumps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:44:29.781Z",
|
||||
"passed": false,
|
||||
"duration": 6689,
|
||||
"reason": "Expected A fox jumps over a dog, but got A brown fox jumps over a dog.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A fox jumps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:45:17.372Z",
|
||||
"passed": true,
|
||||
"duration": 47581,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A fox jumps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:52:22.430Z",
|
||||
"passed": true,
|
||||
"duration": 27328,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [
|
||||
"A fox leaps over a dog."
|
||||
],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:52:30.984Z",
|
||||
"passed": true,
|
||||
"duration": 8548,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "summarization",
|
||||
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
|
||||
"result": [],
|
||||
"expected": "A fox jumps over a dog",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:53:31.003Z",
|
||||
"passed": false,
|
||||
"duration": 60014,
|
||||
"error": {
|
||||
"message": "API call timed out",
|
||||
"code": "UNKNOWN",
|
||||
"type": "Error",
|
||||
"details": {
|
||||
"stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:296:33)\n at listOnTimeout (node:internal/timers:588:17)\n at processTimers (node:internal/timers:523:7)",
|
||||
"message": "API call timed out"
|
||||
}
|
||||
},
|
||||
"reason": "API call timed out",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:53:32.418Z",
|
||||
"passed": true,
|
||||
"duration": 1408,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:53:33.063Z",
|
||||
"passed": true,
|
||||
"duration": 639,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "translation",
|
||||
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
||||
"result": [
|
||||
"¡Hola, mundo!"
|
||||
],
|
||||
"expected": "¡Hola, mundo!",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:53:35.309Z",
|
||||
"passed": true,
|
||||
"duration": 2241,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"I went to the store yesterday."
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:53:35.959Z",
|
||||
"passed": false,
|
||||
"duration": 645,
|
||||
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"\"I went to the store yesterday.\""
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:53:36.606Z",
|
||||
"passed": false,
|
||||
"duration": 641,
|
||||
"reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "grammar",
|
||||
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
||||
"result": [
|
||||
"I went to the store yesterday."
|
||||
],
|
||||
"expected": "I went to the store yesterday",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:53:41.701Z",
|
||||
"passed": false,
|
||||
"duration": 5090,
|
||||
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "language_detection",
|
||||
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
||||
"result": [
|
||||
"French"
|
||||
],
|
||||
"expected": "French",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:53:42.454Z",
|
||||
"passed": true,
|
||||
"duration": 747,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "language_detection",
|
||||
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
||||
"result": [
|
||||
"French"
|
||||
],
|
||||
"expected": "French",
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"router": "openai/gpt-4o-mini",
|
||||
"timestamp": "2025-06-05T22:53:43.116Z",
|
||||
"passed": true,
|
||||
"duration": 657,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "language_detection",
|
||||
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
||||
"result": [
|
||||
"French"
|
||||
],
|
||||
"expected": "French",
|
||||
"model": "deepseek/deepseek-r1:free",
|
||||
"router": "deepseek/deepseek-r1:free",
|
||||
"timestamp": "2025-06-05T22:53:47.420Z",
|
||||
"passed": true,
|
||||
"duration": 4299,
|
||||
"category": "language"
|
||||
},
|
||||
{
|
||||
"test": "synonyms",
|
||||
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
|
||||
"result": [
|
||||
"Joyful"
|
||||
],
|
||||
"expected": "joyful",
|
||||
"model": "anthropic/claude-sonnet-4",
|
||||
"router": "anthropic/claude-sonnet-4",
|
||||
"timestamp": "2025-06-05T22:53:48.762Z",
|
||||
"passed": true,
|
||||
"duration": 1336,
|
||||
"category": "language"
|
||||
}
|
||||
],
|
||||
"highscores": [
|
||||
{
|
||||
"test": "translation",
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 639,
|
||||
"duration_secs": 0.639
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
"duration": 818,
|
||||
"duration_secs": 0.818
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 1451,
|
||||
"duration_secs": 1.451
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1695,8 +2019,8 @@
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 695,
|
||||
"duration_secs": 0.695
|
||||
"duration": 641,
|
||||
"duration_secs": 0.641
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1710,8 +2034,8 @@
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 692,
|
||||
"duration_secs": 0.692
|
||||
"duration": 8548,
|
||||
"duration_secs": 8.548
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1720,8 +2044,8 @@
|
||||
"rankings": [
|
||||
{
|
||||
"model": "openai/gpt-4o-mini",
|
||||
"duration": 459,
|
||||
"duration_secs": 0.459
|
||||
"duration": 657,
|
||||
"duration_secs": 0.657
|
||||
},
|
||||
{
|
||||
"model": "openai/gpt-3.5-turbo",
|
||||
@ -1746,5 +2070,5 @@
|
||||
]
|
||||
}
|
||||
],
|
||||
"lastUpdated": "2025-06-05T22:31:00.924Z"
|
||||
"lastUpdated": "2025-06-05T22:53:48.763Z"
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user