375 lines
12 KiB
JSON
375 lines
12 KiB
JSON
{
|
|
"results": [
|
|
{
|
|
"test": "translation",
|
|
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
|
"result": [
|
|
"¡Hola, mundo!"
|
|
],
|
|
"expected": "¡Hola, mundo!",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-06T15:42:04.594Z",
|
|
"passed": false,
|
|
"duration": 1183,
|
|
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "translation",
|
|
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
|
"result": [
|
|
"Hola, mundo!"
|
|
],
|
|
"expected": "¡Hola, mundo!",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-06T15:42:07.871Z",
|
|
"passed": false,
|
|
"duration": 3265,
|
|
"reason": "Expected ¡Hola, mundo!, but got hola, mundo!",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "translation",
|
|
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
|
"result": [
|
|
"¡Hola, mundo!"
|
|
],
|
|
"expected": "¡Hola, mundo!",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-06T15:42:09.128Z",
|
|
"passed": false,
|
|
"duration": 1244,
|
|
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "translation",
|
|
"prompt": "Translate \"Hello, world!\" to Spanish. Return only the translation, no explanation.",
|
|
"result": [
|
|
"¡Hola, mundo!"
|
|
],
|
|
"expected": "¡Hola, mundo!",
|
|
"model": "openrouter/quasar-alpha",
|
|
"router": "openrouter/quasar-alpha",
|
|
"timestamp": "2025-04-06T15:42:10.051Z",
|
|
"passed": false,
|
|
"duration": 914,
|
|
"reason": "Expected ¡Hola, mundo!, but got ¡hola, mundo!",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "grammar",
|
|
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
|
"result": [
|
|
"I went to the store yesterday."
|
|
],
|
|
"expected": "I went to the store yesterday",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-06T15:42:10.948Z",
|
|
"passed": false,
|
|
"duration": 886,
|
|
"reason": "Expected I went to the store yesterday, but got i went to the store yesterday.",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "grammar",
|
|
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
|
"result": [
|
|
"I went to the store yesterday."
|
|
],
|
|
"expected": "I went to the store yesterday",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-06T15:42:15.782Z",
|
|
"passed": false,
|
|
"duration": 4822,
|
|
"reason": "Expected I went to the store yesterday, but got i went to the store yesterday.",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "grammar",
|
|
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
|
"result": [
|
|
"I went to the store yesterday."
|
|
],
|
|
"expected": "I went to the store yesterday",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-06T15:42:16.691Z",
|
|
"passed": false,
|
|
"duration": 895,
|
|
"reason": "Expected I went to the store yesterday, but got i went to the store yesterday.",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "grammar",
|
|
"prompt": "Correct the grammar in: \"I goes to the store yesterday\". Return only the corrected sentence, no explanation.",
|
|
"result": [
|
|
"I went to the store yesterday."
|
|
],
|
|
"expected": "I went to the store yesterday",
|
|
"model": "openrouter/quasar-alpha",
|
|
"router": "openrouter/quasar-alpha",
|
|
"timestamp": "2025-04-06T15:42:17.494Z",
|
|
"passed": false,
|
|
"duration": 789,
|
|
"reason": "Expected I went to the store yesterday, but got i went to the store yesterday.",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "summarization",
|
|
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, no explanation.",
|
|
"result": [
|
|
"A quick brown fox jumps over a lazy dog."
|
|
],
|
|
"expected": "A fox jumps over a dog",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-06T15:42:18.910Z",
|
|
"passed": false,
|
|
"duration": 1405,
|
|
"reason": "Expected A fox jumps over a dog, but got a quick brown fox jumps over a lazy dog.",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "summarization",
|
|
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, no explanation.",
|
|
"result": [
|
|
"\"The quick brown fox jumps over the lazy dog.\""
|
|
],
|
|
"expected": "A fox jumps over a dog",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-06T15:42:23.978Z",
|
|
"passed": false,
|
|
"duration": 5056,
|
|
"reason": "Expected A fox jumps over a dog, but got \"the quick brown fox jumps over the lazy dog.\"",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "summarization",
|
|
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, no explanation.",
|
|
"result": [
|
|
"A fox jumps over a dog."
|
|
],
|
|
"expected": "A fox jumps over a dog",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-06T15:42:24.931Z",
|
|
"passed": false,
|
|
"duration": 942,
|
|
"reason": "Expected A fox jumps over a dog, but got a fox jumps over a dog.",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "summarization",
|
|
"prompt": "Summarize: \"The quick brown fox jumps over the lazy dog\". Return only the summary, no explanation.",
|
|
"result": [
|
|
"A fox leaps over a dog."
|
|
],
|
|
"expected": "A fox jumps over a dog",
|
|
"model": "openrouter/quasar-alpha",
|
|
"router": "openrouter/quasar-alpha",
|
|
"timestamp": "2025-04-06T15:42:25.886Z",
|
|
"passed": false,
|
|
"duration": 944,
|
|
"reason": "Expected A fox jumps over a dog, but got a fox leaps over a dog.",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "language_detection",
|
|
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
|
"result": [
|
|
"French"
|
|
],
|
|
"expected": "French",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-06T15:42:26.837Z",
|
|
"passed": false,
|
|
"duration": 939,
|
|
"reason": "Expected French, but got french",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "language_detection",
|
|
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
|
"result": [],
|
|
"expected": "French",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-06T15:42:27.292Z",
|
|
"passed": false,
|
|
"duration": 442,
|
|
"reason": "Model returned empty response",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "language_detection",
|
|
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
|
"result": [
|
|
"French"
|
|
],
|
|
"expected": "French",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-06T15:42:28.460Z",
|
|
"passed": false,
|
|
"duration": 1152,
|
|
"reason": "Expected French, but got french",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "language_detection",
|
|
"prompt": "Identify the language of: \"Bonjour, comment allez-vous?\". Return only the language name, no explanation.",
|
|
"result": [
|
|
"French"
|
|
],
|
|
"expected": "French",
|
|
"model": "openrouter/quasar-alpha",
|
|
"router": "openrouter/quasar-alpha",
|
|
"timestamp": "2025-04-06T15:42:29.493Z",
|
|
"passed": false,
|
|
"duration": 1022,
|
|
"reason": "Expected French, but got french",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "synonyms",
|
|
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
|
|
"result": [
|
|
"Joyful"
|
|
],
|
|
"expected": "joyful",
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"router": "openai/gpt-3.5-turbo",
|
|
"timestamp": "2025-04-06T15:42:30.442Z",
|
|
"passed": true,
|
|
"duration": 938,
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "synonyms",
|
|
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
|
|
"result": [],
|
|
"expected": "joyful",
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"router": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"timestamp": "2025-04-06T15:42:30.888Z",
|
|
"passed": false,
|
|
"duration": 436,
|
|
"reason": "Model returned empty response",
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "synonyms",
|
|
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
|
|
"result": [
|
|
"Joyful"
|
|
],
|
|
"expected": "joyful",
|
|
"model": "openai/gpt-4o-mini",
|
|
"router": "openai/gpt-4o-mini",
|
|
"timestamp": "2025-04-06T15:42:31.838Z",
|
|
"passed": true,
|
|
"duration": 947,
|
|
"category": "language"
|
|
},
|
|
{
|
|
"test": "synonyms",
|
|
"prompt": "Provide a synonym for \"happy\". Return only the synonym, no explanation.",
|
|
"result": [
|
|
"Joyful"
|
|
],
|
|
"expected": "joyful",
|
|
"model": "openrouter/quasar-alpha",
|
|
"router": "openrouter/quasar-alpha",
|
|
"timestamp": "2025-04-06T15:42:32.705Z",
|
|
"passed": true,
|
|
"duration": 857,
|
|
"category": "language"
|
|
}
|
|
],
|
|
"highscores": [
|
|
{
|
|
"test": "translation",
|
|
"rankings": [
|
|
{
|
|
"model": "openrouter/quasar-alpha",
|
|
"duration": 914,
|
|
"duration_secs": 0.914
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 1183,
|
|
"duration_secs": 1.183
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "grammar",
|
|
"rankings": [
|
|
{
|
|
"model": "openrouter/quasar-alpha",
|
|
"duration": 789,
|
|
"duration_secs": 0.789
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 886,
|
|
"duration_secs": 0.886
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "summarization",
|
|
"rankings": [
|
|
{
|
|
"model": "openai/gpt-4o-mini",
|
|
"duration": 942,
|
|
"duration_secs": 0.942
|
|
},
|
|
{
|
|
"model": "openrouter/quasar-alpha",
|
|
"duration": 944,
|
|
"duration_secs": 0.944
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "language_detection",
|
|
"rankings": [
|
|
{
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"duration": 442,
|
|
"duration_secs": 0.442
|
|
},
|
|
{
|
|
"model": "openai/gpt-3.5-turbo",
|
|
"duration": 939,
|
|
"duration_secs": 0.939
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"test": "synonyms",
|
|
"rankings": [
|
|
{
|
|
"model": "deepseek/deepseek-r1-distill-qwen-14b:free",
|
|
"duration": 436,
|
|
"duration_secs": 0.436
|
|
},
|
|
{
|
|
"model": "openrouter/quasar-alpha",
|
|
"duration": 857,
|
|
"duration_secs": 0.857
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"lastUpdated": "2025-04-06T15:42:32.705Z"
|
|
} |