kbot llm eq cmp :):

This commit is contained in:
lovebird 2025-06-06 00:53:58 +02:00
parent 1535cb754a
commit c5eadead31
5 changed files with 744 additions and 42 deletions

View File

@ -1,5 +1,5 @@
{
"model": "deepseek/deepseek-r1:free",
"model": "anthropic/claude-sonnet-4",
"messages": [
{
"role": "user",

View File

@ -24,7 +24,7 @@ export enum EqualityCheck {
NONE = 'none'
}
export type EqualityFn = (actual: string, expected: string) => Promise<boolean>;
/**
 * Signature shared by every comparator registered in `EQUALITY_CHECKS`.
 *
 * Receives the actual output, the expected output and, optionally, the prompt
 * that produced them; resolves to `true` when the two count as equal.
 */
export type EqualityFn = (actual: string, expected: string, prompt?: string) => Promise<boolean>;
export const EQUALITY_CHECKS: Record<string, EqualityFn> = {
[EqualityCheck.DEFAULT]: async (actual: string, expected: string): Promise<boolean> => {
@ -40,6 +40,55 @@ export const EQUALITY_CHECKS: Record<string, EqualityFn> = {
return false;
}
},
/**
 * LLM-judged equality: asks the first model returned by `getTestEqualModels()`
 * whether `actual` and `expected` are semantically equivalent.
 *
 * @param actual - Response produced by the model under test.
 * @param expected - Reference response to compare against.
 * @param prompt - Prompt both responses answer; when omitted, the
 *   "original prompt" line is left out of the judge prompt.
 * @returns `true` only when the judge's reply carries `equivalent === true`;
 *   `false` when no judge model is configured, the reply is empty or
 *   malformed, or any step throws.
 */
[EqualityCheck.LLM_EQUAL]: async (actual: string, expected: string, prompt?: string): Promise<boolean> => {
  try {
    const model = getTestEqualModels()[0];
    if (!model) {
      return false;
    }

    // An omitted prompt used to be interpolated as the literal text "undefined".
    // With a prompt supplied, the generated text is identical to before.
    const promptLine = prompt !== undefined ? `The original prompt was: "${prompt}"\n` : '';
    const comparisonPrompt = `You are an assistant that judges if two AI responses are semantically equivalent for a given prompt.
${promptLine}Response A:
"${actual}"
Response B (expected):
"${expected}"
Are these two responses semantically equivalent? Consider that minor differences in formatting (like commas, casing) or phrasing should be ignored as long as the meaning is the same. Dont comment, just return the JSON object.`;

    // JSON schema for the judge's reply; only `equivalent` is read below.
    const responseSchema = {
      type: 'object',
      properties: {
        equivalent: {
          type: 'boolean',
          description: "Whether the two responses are semantically equivalent."
        },
        semantic_distance: {
          type: 'number',
          description: "The semantic distance between the two responses, on a scale of 0 to 100."
        }
      },
      required: ['equivalent', 'semantic_distance']
    };
    const zodSchema = JSONSchemaToZod.convert(responseSchema);
    const format = zodResponseFormat(zodSchema, 'equivalent');

    const result = await run({
      prompt: comparisonPrompt,
      model,
      mode: 'completion',
      format
    });

    const responses: unknown[] = Array.isArray(result) ? result : [];
    if (isEmptyResponse(responses.map(r => String(r)))) {
      return false;
    }

    // NOTE(review): `run` is assumed to return the raw JSON text; an already
    // parsed object is accepted too instead of being coerced through JSON.parse.
    const first = responses[0];
    const verdict: unknown = typeof first === 'string' ? JSON.parse(first) : first;
    if (typeof verdict !== 'object' || verdict === null) {
      return false;
    }
    return (verdict as { equivalent?: unknown }).equivalent === true;
  } catch {
    // Deliberate best-effort: any judge failure (network, parsing, schema) counts as "not equal".
    return false;
  }
},
[EqualityCheck.NONE]: async (): Promise<boolean> => {
return true;
},
@ -55,7 +104,7 @@ export const getFastModels = (): string[] => {
/**
 * Lists the model identifiers available for LLM-based equality judging.
 * The `EqualityCheck.LLM_EQUAL` comparator uses the first entry as its judge.
 */
export const getTestEqualModels = (): string[] => {
return [
E_OPENROUTER_MODEL.MODEL_DEEPSEEK_DEEPSEEK_R1_FREE
E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O
]
}
@ -248,7 +297,7 @@ export const runTest = async (
)
]) as string[]
if (isEmptyResponse(result)) {
if (isEmptyResponse(Array.isArray(result) ? result.map(r => String(r)) : [])) {
testResult = {
test: testName,
prompt,
@ -265,7 +314,7 @@ export const runTest = async (
const actual = result?.[0] || ''
const equalityCheck = options.equalityCheck || EqualityCheck.DEFAULT
const checkFn = EQUALITY_CHECKS[equalityCheck] || EQUALITY_CHECKS[EqualityCheck.DEFAULT]
const passed = await checkFn(actual, expected)
const passed = await checkFn(actual, expected, prompt)
testResult = {
test: testName,

View File

@ -10,7 +10,8 @@ import {
TestResult,
runTest,
generateTestReport,
getReportPaths
getReportPaths,
EqualityCheck
} from './commons'
// Optionally override models for this specific test file
@ -27,6 +28,22 @@ describe('Language Operations', () => {
const TEST_LOG_PATH = getReportPaths('language', 'json')
const TEST_REPORT_PATH = getReportPaths('language', 'md')
// Summarization case: the outcome is judged by the LLM-based equality check,
// and the test asserts on the `passed` flag that `runTest` reports.
it.each(models)('should summarize text with model %s', async (modelName) => {
  const summaryPrompt = 'Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.'
  const expectedSummary = 'A fox jumps over a dog'
  const checkOptions = { equalityCheck: EqualityCheck.LLM_EQUAL }

  const outcome = await runTest(
    summaryPrompt,
    expectedSummary,
    'summarization',
    modelName,
    TEST_LOG_PATH,
    'completion',
    checkOptions
  )

  testResults.push(outcome)
  expect(outcome.passed).toBe(true)
}, { timeout: TEST_TIMEOUT })
it.each(models)('should translate text with model %s', async (modelName) => {
const result = await runTest(
'Translate "Hello, world!" to Spanish. Return only the translation, no explanation.',
@ -51,18 +68,6 @@ describe('Language Operations', () => {
expect(normalize(result.result[0])).toEqual(normalize('i went to the store yesterday'))
}, { timeout: TEST_TIMEOUT })
it.each(models)('should summarize text with model %s', async (modelName) => {
const result = await runTest(
'Summarize: "The quick brown fox jumps over the dog". Return only the summary, compact, no explanation.',
'A fox jumps over a dog',
'summarization',
modelName,
TEST_LOG_PATH
)
testResults.push(result)
expect(normalize(result.result[0])).toEqual(normalize('a fox jumps over a dog'))
}, { timeout: TEST_TIMEOUT })
it.each(models)('should identify language with model %s', async (modelName) => {
const result = await runTest(
'Identify the language of: "Bonjour, comment allez-vous?". Return only the language name, no explanation.',

View File

@ -4757,6 +4757,330 @@
"passed": true,
"duration": 3836,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:41:10.838Z",
"passed": true,
"duration": 1465,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:41:11.649Z",
"passed": true,
"duration": 805,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:41:16.717Z",
"passed": true,
"duration": 5063,
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:41:17.368Z",
"passed": false,
"duration": 646,
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:41:18.259Z",
"passed": false,
"duration": 886,
"reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:41:23.006Z",
"passed": false,
"duration": 4742,
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A brown fox leaps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:41:32.126Z",
"passed": false,
"duration": 9115,
"reason": "Expected A fox jumps over a dog, but got A brown fox leaps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A brown fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:44:29.781Z",
"passed": false,
"duration": 6689,
"reason": "Expected A fox jumps over a dog, but got A brown fox jumps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:45:17.372Z",
"passed": true,
"duration": 47581,
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:52:22.430Z",
"passed": true,
"duration": 27328,
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox leaps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:52:30.984Z",
"passed": true,
"duration": 8548,
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [],
"expected": "A fox jumps over a dog",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:53:31.003Z",
"passed": false,
"duration": 60014,
"error": {
"message": "API call timed out",
"code": "UNKNOWN",
"type": "Error",
"details": {
"stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:296:33)\n at listOnTimeout (node:internal/timers:588:17)\n at processTimers (node:internal/timers:523:7)",
"message": "API call timed out"
}
},
"reason": "API call timed out",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:53:32.418Z",
"passed": true,
"duration": 1408,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:53:33.063Z",
"passed": true,
"duration": 639,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:53:35.309Z",
"passed": true,
"duration": 2241,
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:53:35.959Z",
"passed": false,
"duration": 645,
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:53:36.606Z",
"passed": false,
"duration": 641,
"reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:53:41.701Z",
"passed": false,
"duration": 5090,
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:53:42.454Z",
"passed": true,
"duration": 747,
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:53:43.116Z",
"passed": true,
"duration": 657,
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:53:47.420Z",
"passed": true,
"duration": 4299,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:53:48.762Z",
"passed": true,
"duration": 1336,
"category": "language"
}
],
"highscores": [
@ -4898,15 +5222,15 @@
{
"test": "translation",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 639,
"duration_secs": 0.639
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 818,
"duration_secs": 0.818
},
{
"model": "openai/gpt-4o-mini",
"duration": 1451,
"duration_secs": 1.451
}
]
},
@ -4920,8 +5244,8 @@
},
{
"model": "openai/gpt-4o-mini",
"duration": 695,
"duration_secs": 0.695
"duration": 641,
"duration_secs": 0.641
}
]
},
@ -4935,8 +5259,8 @@
},
{
"model": "openai/gpt-4o-mini",
"duration": 692,
"duration_secs": 0.692
"duration": 8548,
"duration_secs": 8.548
}
]
},
@ -4945,8 +5269,8 @@
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 459,
"duration_secs": 0.459
"duration": 657,
"duration_secs": 0.657
},
{
"model": "openai/gpt-3.5-turbo",
@ -5171,5 +5495,5 @@
]
}
],
"lastUpdated": "2025-06-05T22:31:00.927Z"
"lastUpdated": "2025-06-05T22:53:48.765Z"
}

View File

@ -1667,21 +1667,345 @@
"passed": true,
"duration": 3836,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:41:10.838Z",
"passed": true,
"duration": 1465,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:41:11.649Z",
"passed": true,
"duration": 805,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:41:16.717Z",
"passed": true,
"duration": 5063,
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:41:17.368Z",
"passed": false,
"duration": 646,
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:41:18.259Z",
"passed": false,
"duration": 886,
"reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:41:23.006Z",
"passed": false,
"duration": 4742,
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A brown fox leaps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:41:32.126Z",
"passed": false,
"duration": 9115,
"reason": "Expected A fox jumps over a dog, but got A brown fox leaps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A brown fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:44:29.781Z",
"passed": false,
"duration": 6689,
"reason": "Expected A fox jumps over a dog, but got A brown fox jumps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:45:17.372Z",
"passed": true,
"duration": 47581,
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:52:22.430Z",
"passed": true,
"duration": 27328,
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox leaps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:52:30.984Z",
"passed": true,
"duration": 8548,
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [],
"expected": "A fox jumps over a dog",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:53:31.003Z",
"passed": false,
"duration": 60014,
"error": {
"message": "API call timed out",
"code": "UNKNOWN",
"type": "Error",
"details": {
"stack": "Error: API call timed out\n at Timeout._onTimeout (C:\\Users\\zx\\Desktop\\polymech\\polymech-mono\\packages\\kbot\\tests\\unit\\commons.ts:296:33)\n at listOnTimeout (node:internal/timers:588:17)\n at processTimers (node:internal/timers:523:7)",
"message": "API call timed out"
}
},
"reason": "API call timed out",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:53:32.418Z",
"passed": true,
"duration": 1408,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:53:33.063Z",
"passed": true,
"duration": 639,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:53:35.309Z",
"passed": true,
"duration": 2241,
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:53:35.959Z",
"passed": false,
"duration": 645,
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:53:36.606Z",
"passed": false,
"duration": 641,
"reason": "Expected I went to the store yesterday, but got \"I went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:53:41.701Z",
"passed": false,
"duration": 5090,
"reason": "Expected I went to the store yesterday, but got I went to the store yesterday.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:53:42.454Z",
"passed": true,
"duration": 747,
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T22:53:43.116Z",
"passed": true,
"duration": 657,
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T22:53:47.420Z",
"passed": true,
"duration": 4299,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T22:53:48.762Z",
"passed": true,
"duration": 1336,
"category": "language"
}
],
"highscores": [
{
"test": "translation",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 639,
"duration_secs": 0.639
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 818,
"duration_secs": 0.818
},
{
"model": "openai/gpt-4o-mini",
"duration": 1451,
"duration_secs": 1.451
}
]
},
@ -1695,8 +2019,8 @@
},
{
"model": "openai/gpt-4o-mini",
"duration": 695,
"duration_secs": 0.695
"duration": 641,
"duration_secs": 0.641
}
]
},
@ -1710,8 +2034,8 @@
},
{
"model": "openai/gpt-4o-mini",
"duration": 692,
"duration_secs": 0.692
"duration": 8548,
"duration_secs": 8.548
}
]
},
@ -1720,8 +2044,8 @@
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 459,
"duration_secs": 0.459
"duration": 657,
"duration_secs": 0.657
},
{
"model": "openai/gpt-3.5-turbo",
@ -1746,5 +2070,5 @@
]
}
],
"lastUpdated": "2025-06-05T22:31:00.924Z"
"lastUpdated": "2025-06-05T22:53:48.763Z"
}