mono/packages/kbot/tests/unit/reports/language.json

1534 lines
51 KiB
JSON

{
"results": [
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T05:14:34.735Z",
"passed": false,
"duration": 942,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T05:14:35.415Z",
"passed": false,
"duration": 677,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T05:14:35.981Z",
"passed": false,
"duration": 564,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T05:14:36.670Z",
"passed": false,
"duration": 684,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, no explanation.",
"result": [
"The quick brown fox jumps over the lazy dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T05:14:37.351Z",
"passed": false,
"duration": 678,
"reason": "Expected A fox jumps over a dog, but got the quick brown fox jumps over the lazy dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, no explanation.",
"result": [
"A fox leaps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T05:14:38.015Z",
"passed": false,
"duration": 661,
"reason": "Expected A fox jumps over a dog, but got a fox leaps over a dog.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T05:14:39.002Z",
"passed": false,
"duration": 985,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T05:14:39.575Z",
"passed": false,
"duration": 571,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T05:14:40.170Z",
"passed": true,
"duration": 594,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T05:14:40.790Z",
"passed": true,
"duration": 617,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:07.876Z",
"passed": false,
"duration": 1153,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:08.421Z",
"passed": false,
"duration": 540,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:08.967Z",
"passed": false,
"duration": 542,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:09.649Z",
"passed": false,
"duration": 677,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, no explanation.",
"result": [
"The quick brown fox jumps over the lazy dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:10.358Z",
"passed": false,
"duration": 706,
"reason": "Expected A fox jumps over a dog, but got the quick brown fox jumps over the lazy dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:10.973Z",
"passed": false,
"duration": 612,
"reason": "Expected A fox jumps over a dog, but got a fox jumps over a dog.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:11.549Z",
"passed": false,
"duration": 573,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:12.127Z",
"passed": false,
"duration": 575,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:46:13.050Z",
"passed": true,
"duration": 920,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:46:13.615Z",
"passed": true,
"duration": 562,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:50:29.662Z",
"passed": false,
"duration": 756,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:50:30.776Z",
"passed": false,
"duration": 1109,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:50:31.560Z",
"passed": false,
"duration": 781,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:50:32.159Z",
"passed": false,
"duration": 595,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, no explanation.",
"result": [
"A quick fox jumps over a lazy dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:50:33.261Z",
"passed": false,
"duration": 1099,
"reason": "Expected A fox jumps over a dog, but got a quick fox jumps over a lazy dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, no explanation.",
"result": [
"A fox jumps over a lazy dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:50:34.108Z",
"passed": false,
"duration": 840,
"reason": "Expected A fox jumps over a dog, but got a fox jumps over a lazy dog.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:50:34.652Z",
"passed": false,
"duration": 541,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:50:35.143Z",
"passed": false,
"duration": 487,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:50:35.656Z",
"passed": true,
"duration": 510,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:50:36.124Z",
"passed": true,
"duration": 465,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:51:25.151Z",
"passed": false,
"duration": 871,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:51:26.099Z",
"passed": false,
"duration": 943,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:51:26.669Z",
"passed": false,
"duration": 567,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:51:27.226Z",
"passed": false,
"duration": 554,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, compact, no explanation.",
"result": [
"The quick brown fox jumps over the lazy dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:51:28.019Z",
"passed": false,
"duration": 791,
"reason": "Expected A fox jumps over a dog, but got the quick brown fox jumps over the lazy dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, compact, no explanation.",
"result": [
"A quick fox jumps over a lazy dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:51:28.771Z",
"passed": false,
"duration": 746,
"reason": "Expected A fox jumps over a dog, but got a quick fox jumps over a lazy dog.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:51:29.352Z",
"passed": false,
"duration": 578,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:51:30.320Z",
"passed": false,
"duration": 966,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:51:30.862Z",
"passed": true,
"duration": 539,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:51:33.736Z",
"passed": true,
"duration": 2872,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:51:53.263Z",
"passed": false,
"duration": 831,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:51:53.884Z",
"passed": false,
"duration": 617,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:51:54.462Z",
"passed": false,
"duration": 575,
"reason": "Expected I went to the store yesterday, but got i went to the store yesterday.",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:51:55.104Z",
"passed": false,
"duration": 639,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A quick brown fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:51:55.894Z",
"passed": false,
"duration": 787,
"reason": "Expected A fox jumps over a dog, but got a quick brown fox jumps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:51:56.484Z",
"passed": false,
"duration": 582,
"reason": "Expected A fox jumps over a dog, but got a fox jumps over a dog.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:51:59.835Z",
"passed": false,
"duration": 3348,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:52:00.400Z",
"passed": false,
"duration": 562,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:52:01.616Z",
"passed": true,
"duration": 1214,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:52:02.542Z",
"passed": true,
"duration": 923,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:53:09.330Z",
"passed": false,
"duration": 844,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:53:10.262Z",
"passed": false,
"duration": 928,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:53:11.043Z",
"passed": false,
"duration": 779,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:53:11.724Z",
"passed": false,
"duration": 678,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:53:12.663Z",
"passed": false,
"duration": 937,
"reason": "Expected A fox jumps over a dog, but got a fox jumps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:53:13.482Z",
"passed": false,
"duration": 817,
"reason": "Expected A fox jumps over a dog, but got a fox jumps over a dog.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:53:14.188Z",
"passed": false,
"duration": 704,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:53:14.748Z",
"passed": false,
"duration": 557,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:53:15.311Z",
"passed": true,
"duration": 559,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:53:15.852Z",
"passed": true,
"duration": 538,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:53:35.433Z",
"passed": false,
"duration": 941,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:53:36.309Z",
"passed": false,
"duration": 871,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:53:36.924Z",
"passed": false,
"duration": 612,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:54:06.162Z",
"passed": false,
"duration": 818,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:54:06.810Z",
"passed": false,
"duration": 642,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:54:07.390Z",
"passed": false,
"duration": 576,
"reason": "Expected I went to the store yesterday, but got i went to the store yesterday.",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:54:08.237Z",
"passed": false,
"duration": 844,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"The quick brown fox jumps over the dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:54:08.852Z",
"passed": false,
"duration": 612,
"reason": "Expected A fox jumps over a dog, but got the quick brown fox jumps over the dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:54:09.559Z",
"passed": false,
"duration": 699,
"reason": "Expected A fox jumps over a dog, but got a fox jumps over a dog.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:54:10.257Z",
"passed": false,
"duration": 695,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:54:10.757Z",
"passed": false,
"duration": 497,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-3.5-turbo",
"router": "openai/gpt-3.5-turbo",
"timestamp": "2025-06-05T18:54:11.331Z",
"passed": true,
"duration": 570,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:54:12.093Z",
"passed": true,
"duration": 760,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:54:43.059Z",
"passed": false,
"duration": 2067,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:54:43.754Z",
"passed": false,
"duration": 689,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:54:45.466Z",
"passed": false,
"duration": 1708,
"reason": "Expected I went to the store yesterday, but got i went to the store yesterday.",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:54:46.074Z",
"passed": false,
"duration": 605,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A brown fox leaps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:54:48.340Z",
"passed": false,
"duration": 2263,
"reason": "Expected A fox jumps over a dog, but got a brown fox leaps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:54:49.025Z",
"passed": false,
"duration": 675,
"reason": "Expected A fox jumps over a dog, but got a fox jumps over a dog.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:54:50.753Z",
"passed": false,
"duration": 1724,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:54:51.307Z",
"passed": false,
"duration": 551,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:54:53.244Z",
"passed": true,
"duration": 1934,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:54:53.740Z",
"passed": true,
"duration": 493,
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:55:31.636Z",
"passed": false,
"duration": 1317,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:55:32.306Z",
"passed": false,
"duration": 666,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "translation",
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
"result": [
"¡Hola, mundo!"
],
"expected": "¡Hola, mundo!",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T18:55:37.706Z",
"passed": false,
"duration": 5397,
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"I went to the store yesterday."
],
"expected": "I went to the store yesterday",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:55:39.433Z",
"passed": false,
"duration": 1722,
"reason": "Expected I went to the store yesterday, but got i went to the store yesterday.",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:55:40.607Z",
"passed": false,
"duration": 1171,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "grammar",
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
"result": [
"\"I went to the store yesterday.\""
],
"expected": "I went to the store yesterday",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T18:55:45.810Z",
"passed": false,
"duration": 5199,
"reason": "Expected I went to the store yesterday, but got \"i went to the store yesterday.\"",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:55:47.634Z",
"passed": false,
"duration": 1820,
"reason": "Expected A fox jumps over a dog, but got a fox jumps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A fox jumps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:55:48.336Z",
"passed": false,
"duration": 699,
"reason": "Expected A fox jumps over a dog, but got a fox jumps over a dog.",
"category": "language"
},
{
"test": "summarization",
"prompt": "Summarize: \"The quick brown fox jumps over the dog\". Return only the summary, compact, no explanation.",
"result": [
"A quick brown fox leaps over a dog."
],
"expected": "A fox jumps over a dog",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T18:55:55.720Z",
"passed": false,
"duration": 7380,
"reason": "Expected A fox jumps over a dog, but got a quick brown fox leaps over a dog.",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:55:57.453Z",
"passed": false,
"duration": 1725,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:55:58.233Z",
"passed": false,
"duration": 776,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "language_detection",
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
"result": [
"French"
],
"expected": "French",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T18:56:03.483Z",
"passed": false,
"duration": 5247,
"reason": "Expected French, but got french",
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Content"
],
"expected": "joyful",
"model": "anthropic/claude-sonnet-4",
"router": "anthropic/claude-sonnet-4",
"timestamp": "2025-06-05T18:56:05.453Z",
"passed": false,
"duration": 1967,
"reason": "Expected joyful, but got content",
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "openai/gpt-4o-mini",
"router": "openai/gpt-4o-mini",
"timestamp": "2025-06-05T18:56:06.005Z",
"passed": true,
"duration": 548,
"category": "language"
},
{
"test": "synonyms",
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
"result": [
"Joyful"
],
"expected": "joyful",
"model": "deepseek/deepseek-r1:free",
"router": "deepseek/deepseek-r1:free",
"timestamp": "2025-06-05T18:56:08.626Z",
"passed": true,
"duration": 2616,
"category": "language"
}
],
"highscores": [
{
"test": "translation",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 666,
"duration_secs": 0.666
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 818,
"duration_secs": 0.818
}
]
},
{
"test": "grammar",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 576,
"duration_secs": 0.576
},
{
"model": "openai/gpt-4o-mini",
"duration": 1171,
"duration_secs": 1.171
}
]
},
{
"test": "summarization",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 612,
"duration_secs": 0.612
},
{
"model": "openai/gpt-4o-mini",
"duration": 699,
"duration_secs": 0.699
}
]
},
{
"test": "language_detection",
"rankings": [
{
"model": "openai/gpt-3.5-turbo",
"duration": 695,
"duration_secs": 0.695
},
{
"model": "openai/gpt-4o-mini",
"duration": 776,
"duration_secs": 0.776
}
]
},
{
"test": "synonyms",
"rankings": [
{
"model": "openai/gpt-4o-mini",
"duration": 548,
"duration_secs": 0.548
},
{
"model": "openai/gpt-3.5-turbo",
"duration": 570,
"duration_secs": 0.57
}
]
}
],
"lastUpdated": "2025-06-05T18:56:08.627Z"
}