From 0bc4969f88a30b7fe4092f75af4e3585e1013f10 Mon Sep 17 00:00:00 2001 From: amirrr <6696894+amirrr@users.noreply.github.com> Date: Mon, 11 May 2026 21:43:47 +0200 Subject: [PATCH 1/2] add llm eval page --- client/public/locales/ar/translation.json | 51 ++- client/public/locales/bn/translation.json | 51 ++- client/public/locales/en/translation.json | 51 ++- client/public/locales/es/translation.json | 51 ++- client/public/locales/fr/translation.json | 51 ++- client/public/locales/hi/translation.json | 51 ++- client/public/locales/ja/translation.json | 51 ++- client/public/locales/pt/translation.json | 51 ++- client/public/locales/ru/translation.json | 51 ++- client/public/locales/zh/translation.json | 51 ++- client/src/App.tsx | 3 +- client/src/data/llmEvals.json | 390 ++++++++++++++++++ client/src/pages/LlmEvals.tsx | 481 ++++++++++++++++++++++ client/src/partials/NavBar.tsx | 10 +- 14 files changed, 1382 insertions(+), 12 deletions(-) create mode 100644 client/src/data/llmEvals.json create mode 100644 client/src/pages/LlmEvals.tsx diff --git a/client/public/locales/ar/translation.json b/client/public/locales/ar/translation.json index 2cee456..87bc15e 100644 --- a/client/public/locales/ar/translation.json +++ b/client/public/locales/ar/translation.json @@ -98,7 +98,8 @@ "dashboard": "لوحة القيادة", "people": "الأشخاص", "research": "البحث", - "signin": "تسجيل الدخول" + "signin": "تسجيل الدخول", + "llmEvals": "تقييمات النماذج اللغوية" }, "navbar.participate →": "شارك →", "publications": { @@ -339,5 +340,53 @@ "question": "يزن كلب وقطة 100 رطل في المجموع. يزن الكلب 86 رطلاً. 
ما هو الفرق في الوزن بين الكلب والقطة؟", "intuitiveExplanation": "86 رطلاً (وزن الكلب) يبدو بديهياً" } + }, + "llmEvals": { + "eyebrow": "معيار الحس العام", + "title": "تقييمات الحس العام للنماذج اللغوية الكبيرة", + "intro": "يقارن هذا الجدول نماذج اللغة وفق ثلاثة مقاييس للحس العام على مستوى الفرد: التوافق والوعي والمنطقية العامة.", + "methodology": "يقيس التوافق ما اذا كان النموذج يتفق مع الاغلبية البشرية لكل عبارة. ويقيس الوعي ما اذا كان يتنبأ بشكل صحيح بما ستعتقده اغلبية الناس. وتجمع المنطقية العامة بين هاتين الاشارتين.", + "detailsPrefix": "لمزيد من التفاصيل حول اطار التقييم والنتائج، زر", + "detailsLink": "صفحة عرض الحس العام للنماذج اللغوية الكبيرة", + "detailsSuffix": ".", + "modelsEvaluated": "النماذج المقيمة", + "topCommonsensicalityII": "اعلى منطقية عامة (II)", + "scoreRank": "درجة {{score}}، الترتيب {{rank}}", + "lastUpdated": "اخر تحديث", + "tableTitle": "نتائج التقييم", + "tableHelp": "ابحث حسب النموذج، او رشح حسب حد ادنى لدرجة المقياس، او انقر على اي عنوان عمود للفرز.", + "modelFilter": "النموذج", + "searchPlaceholder": "البحث في النماذج", + "scoreMetric": "مقياس الدرجة", + "minimumScore": "الحد الادنى للدرجة", + "anyPlaceholder": "اي قيمة", + "caption": "درجات تقييم الحس العام للنماذج اللغوية الكبيرة", + "showing": "عرض {{shown}} من اصل {{total}} نموذج", + "scoreNote": "تعرض الدرجة كقيمة مع الترتيب بين قوسين.", + "columns": { + "model": "النموذج", + "added": "تاريخ الاضافة", + "consensusI": "التوافق (I)", + "consensusII": "التوافق (II)", + "awarenessI": "الوعي (I)", + "awarenessII": "الوعي (II)", + "commonsensicalityI": "المنطقية العامة (I)", + "commonsensicalityII": "المنطقية العامة (II)" + }, + "variants": "تعرض كل درجة وفق صيغتين. في الصيغة (I)، تحدد الاغلبية البشرية من تقييمات البشر فقط. في الصيغة (II)، يعاد حساب الاغلبية بعد ادخال تقييم النموذج. 
الدرجات هي نسب مئوية، والارقام بين قوسين تعرض ترتيب كل نموذج داخل ذلك العمود.", + "modelDetailsPrefix": "لمقارنات اضافية بين النماذج، زر", + "modelDetailsSuffix": ".", + "methodDetailsPrefix": "للاطلاع على المنهجية والصيغ، راجع", + "methodDetailsSuffix": ".", + "paperLink": "الورقة البحثية", + "calculationTitle": "ملاحظات الحساب", + "calculationIntro": "يستخدم الجدول تعريفات تقييم الاغلبية التالية للصيغتين.", + "variantIFormulaLabel": "الصيغة (I): اغلبية بشرية فقط", + "variantIIFormulaLabel": "الصيغة (II): اغلبية تشمل تقييم النموذج", + "humanShareDefinition": "نسبة المقيمين البشر الذين يوافقون على العبارة i.", + "omegaDefinition": "مجموعة المقيمين البشر المخصصين للعبارة i.", + "alphaDefinition": "تقييم النموذج للعبارة i.", + "ratingsDefinition": "تقييم المقيم البشري j للعبارة i.", + "indicatorDefinition": "دالة مؤشر تعيد 1 عندما يكون الشرط صحيحا و0 خلاف ذلك." } } diff --git a/client/public/locales/bn/translation.json b/client/public/locales/bn/translation.json index 6c4b094..a62baa8 100644 --- a/client/public/locales/bn/translation.json +++ b/client/public/locales/bn/translation.json @@ -98,7 +98,8 @@ "dashboard": "ড্যাশবোর্ড", "people": "মানুষ", "research": "গবেষণা", - "signin": "প্রবেশ করুন" + "signin": "প্রবেশ করুন", + "llmEvals": "এলএলএম মূল্যায়ন" }, "navbar.participate →": "অংশগ্রহণ করুন →", "publications": { @@ -339,5 +340,53 @@ "question": "একটি কুকুর এবং একটি বিড়ালের মোট ওজন ১০০ পাউন্ড। কুকুরের ওজন ৮৬ পাউন্ড। কুকুর এবং বিড়ালের ওজনের পার্থক্য কত?", "intuitiveExplanation": "৮৬ পাউন্ড (কুকুরের ওজন) স্পষ্ট মনে হয়" } + }, + "llmEvals": { + "eyebrow": "সাধারণ বোধের বেঞ্চমার্ক", + "title": "এলএলএম সাধারণ বোধ মূল্যায়ন", + "intro": "এই টেবিলটি ভাষা মডেলগুলোকে তিনটি ব্যক্তি-স্তরের সাধারণ বোধ মাপে তুলনা করে: consensus, awareness এবং commonsensicality।", + "methodology": "Consensus মাপে একটি মডেল প্রতিটি বিবৃতিতে মানব majority-এর সঙ্গে একমত কিনা। Awareness মাপে মডেলটি অধিকাংশ মানুষ কী ভাববে তা সঠিকভাবে অনুমান করতে পারে কিনা। Commonsensicality এই দুই সংকেতকে একত্র 
করে।", + "detailsPrefix": "মূল্যায়ন কাঠামো এবং ফলাফল সম্পর্কে আরও বিস্তারিত জানতে দেখুন", + "detailsLink": "এলএলএম সাধারণ বোধ ডেমো পেজ", + "detailsSuffix": ".", + "modelsEvaluated": "মূল্যায়িত মডেল", + "topCommonsensicalityII": "সর্বোচ্চ commonsensicality (II)", + "scoreRank": "{{score}} স্কোর, rank {{rank}}", + "lastUpdated": "সর্বশেষ আপডেট", + "tableTitle": "মূল্যায়নের ফলাফল", + "tableHelp": "মডেল দিয়ে খুঁজুন, ন্যূনতম metric score দিয়ে ফিল্টার করুন, অথবা সাজাতে যেকোনো কলাম শিরোনামে ক্লিক করুন।", + "modelFilter": "মডেল", + "searchPlaceholder": "মডেল খুঁজুন", + "scoreMetric": "স্কোর মেট্রিক", + "minimumScore": "ন্যূনতম স্কোর", + "anyPlaceholder": "যেকোনো", + "caption": "এলএলএম সাধারণ বোধ মূল্যায়নের স্কোর", + "showing": "{{total}} মডেলের মধ্যে {{shown}}টি দেখানো হচ্ছে", + "scoreNote": "স্কোর মান হিসেবে দেখানো হয়েছে, আর rank বন্ধনীর মধ্যে।", + "columns": { + "model": "মডেল", + "added": "যোগ করা হয়েছে", + "consensusI": "Consensus (I)", + "consensusII": "Consensus (II)", + "awarenessI": "Awareness (I)", + "awarenessII": "Awareness (II)", + "commonsensicalityI": "Commonsensicality (I)", + "commonsensicalityII": "Commonsensicality (II)" + }, + "variants": "প্রতিটি স্কোর দুটি variant-এ দেওয়া হয়েছে। Variant (I)-এ মানব majority শুধু মানব rating থেকে নির্ধারিত হয়। Variant (II)-এ মডেলের rating অন্তর্ভুক্ত করার পর majority পুনরায় গণনা করা হয়। স্কোরগুলো শতাংশ, এবং বন্ধনীর সংখ্যাগুলো সেই কলামের মধ্যে প্রতিটি মডেলের ranking দেখায়।", + "modelDetailsPrefix": "অতিরিক্ত মডেল তুলনার জন্য দেখুন", + "modelDetailsSuffix": ".", + "methodDetailsPrefix": "পদ্ধতি এবং formula-এর জন্য দেখুন", + "methodDetailsSuffix": ".", + "paperLink": "গবেষণা প্রবন্ধ", + "calculationTitle": "গণনার নোট", + "calculationIntro": "দুটি variant-এর জন্য টেবিলটি নিচের majority-rating সংজ্ঞা ব্যবহার করে।", + "variantIFormulaLabel": "Variant (I): শুধু মানব majority", + "variantIIFormulaLabel": "Variant (II): মডেল rating সহ majority", + "humanShareDefinition": "বিবৃতি i-এর সঙ্গে একমত মানব rater-এর অংশ।", 
+ "omegaDefinition": "বিবৃতি i-তে নিযুক্ত মানব rater-দের সেট।", + "alphaDefinition": "বিবৃতি i-এর জন্য মডেলের rating।", + "ratingsDefinition": "বিবৃতি i-এর জন্য মানব rater j-এর rating।", + "indicatorDefinition": "একটি indicator function, যা শর্ত সত্য হলে 1 এবং অন্যথায় 0 ফেরত দেয়।" } } diff --git a/client/public/locales/en/translation.json b/client/public/locales/en/translation.json index 9d02435..4d9d98d 100644 --- a/client/public/locales/en/translation.json +++ b/client/public/locales/en/translation.json @@ -100,7 +100,8 @@ "dashboard": "Dashboard", "people": "People", "research": "Research", - "signin": "Signin" + "signin": "Signin", + "llmEvals": "LLM Evals" }, "navbar.participate →": "Participate →", "publications": { @@ -341,5 +342,53 @@ "question": "A dog and a cat weigh 100 pounds in total. The dog weighs 86 pounds. What is the difference in weight between the dog and the cat?", "intuitiveExplanation": "86 lbs (the dog's weight) feels obvious" } + }, + "llmEvals": { + "eyebrow": "Common sense benchmark", + "title": "LLM Commonsense Evals", + "intro": "This table compares language models on three individual-level common sense measures: consensus, awareness, and commonsensicality.", + "methodology": "Consensus measures whether a model agrees with the human majority for each statement. Awareness measures whether it correctly predicts what most people would think. 
Commonsensicality combines these two signals.", + "detailsPrefix": "For more details on the evaluation framework and results, visit", + "detailsLink": "LLM commonsense demo page", + "detailsSuffix": ".", + "modelsEvaluated": "Models evaluated", + "topCommonsensicalityII": "Top commonsensicality (II)", + "scoreRank": "{{score}} score, rank {{rank}}", + "lastUpdated": "Last updated", + "tableTitle": "Evaluation Results", + "tableHelp": "Search by model, filter by a minimum metric score, or click any column heading to sort.", + "modelFilter": "Model", + "searchPlaceholder": "Search models", + "scoreMetric": "Score metric", + "minimumScore": "Minimum score", + "anyPlaceholder": "Any", + "caption": "LLM commonsense evaluation scores", + "showing": "Showing {{shown}} of {{total}} models", + "scoreNote": "Score shown as value with rank in parentheses.", + "columns": { + "model": "Model", + "added": "Added", + "consensusI": "Consensus (I)", + "consensusII": "Consensus (II)", + "awarenessI": "Awareness (I)", + "awarenessII": "Awareness (II)", + "commonsensicalityI": "Commonsensicality (I)", + "commonsensicalityII": "Commonsensicality (II)" + }, + "variants": "Each score is reported under two variants. In variant (I), the human majority is determined from human ratings only. In variant (II), the majority is recalculated after including the model rating. 
Scores are percentages, and numbers in parentheses show each model's ranking within that column.", + "modelDetailsPrefix": "For additional model comparisons, visit", + "modelDetailsSuffix": ".", + "methodDetailsPrefix": "For methodology and formulas, see", + "methodDetailsSuffix": ".", + "paperLink": "the research paper", + "calculationTitle": "Calculation notes", + "calculationIntro": "The table uses the following majority-rating definitions for the two variants.", + "variantIFormulaLabel": "Variant (I): human-only majority", + "variantIIFormulaLabel": "Variant (II): majority including the model rating", + "humanShareDefinition": "The share of human raters who agree with statement i.", + "omegaDefinition": "The set of human raters assigned to statement i.", + "alphaDefinition": "The model rating for statement i.", + "ratingsDefinition": "The rating from human rater j for statement i.", + "indicatorDefinition": "An indicator function that returns 1 when the condition is true and 0 otherwise." } } diff --git a/client/public/locales/es/translation.json b/client/public/locales/es/translation.json index 1d6f546..3a0bf35 100644 --- a/client/public/locales/es/translation.json +++ b/client/public/locales/es/translation.json @@ -98,7 +98,8 @@ "dashboard": "Tablero", "people": "Gente", "research": "Investigación", - "signin": "Iniciar sesión" + "signin": "Iniciar sesión", + "llmEvals": "Evaluaciones LLM" }, "navbar.participate →": "Participar →", "publications": { @@ -345,5 +346,53 @@ "question": "Un perro y un gato pesan 100 libras en total. El perro pesa 86 libras. 
¿Cuál es la diferencia de peso entre el perro y el gato?", "intuitiveExplanation": "86 lbs (el peso del perro) parece obvio" } + }, + "llmEvals": { + "eyebrow": "Benchmark de sentido común", + "title": "Evaluaciones de sentido común de LLM", + "intro": "Esta tabla compara modelos de lenguaje en tres medidas de sentido común a nivel individual: consenso, conciencia y commonsensicalidad.", + "methodology": "El consenso mide si un modelo coincide con la mayoría humana para cada enunciado. La conciencia mide si predice correctamente lo que pensaría la mayoría de las personas. La commonsensicalidad combina estas dos señales.", + "detailsPrefix": "Para más detalles sobre el marco de evaluación y los resultados, visita", + "detailsLink": "la página de demostración de sentido común de LLM", + "detailsSuffix": ".", + "modelsEvaluated": "Modelos evaluados", + "topCommonsensicalityII": "Mayor commonsensicalidad (II)", + "scoreRank": "puntuación {{score}}, rango {{rank}}", + "lastUpdated": "Última actualización", + "tableTitle": "Resultados de evaluación", + "tableHelp": "Busca por modelo, filtra por una puntuación mínima de métrica o haz clic en cualquier encabezado de columna para ordenar.", + "modelFilter": "Modelo", + "searchPlaceholder": "Buscar modelos", + "scoreMetric": "Métrica de puntuación", + "minimumScore": "Puntuación mínima", + "anyPlaceholder": "Cualquiera", + "caption": "Puntuaciones de evaluación de sentido común de LLM", + "showing": "Mostrando {{shown}} de {{total}} modelos", + "scoreNote": "La puntuación se muestra como valor con el rango entre paréntesis.", + "columns": { + "model": "Modelo", + "added": "Agregado", + "consensusI": "Consenso (I)", + "consensusII": "Consenso (II)", + "awarenessI": "Conciencia (I)", + "awarenessII": "Conciencia (II)", + "commonsensicalityI": "Commonsensicalidad (I)", + "commonsensicalityII": "Commonsensicalidad (II)" + }, + "variants": "Cada puntuación se presenta bajo dos variantes. 
En la variante (I), la mayoría humana se determina solo a partir de las calificaciones humanas. En la variante (II), la mayoría se recalcula después de incluir la calificación del modelo. Las puntuaciones son porcentajes, y los números entre paréntesis muestran la clasificación de cada modelo dentro de esa columna.", + "modelDetailsPrefix": "Para comparaciones adicionales de modelos, visita", + "modelDetailsSuffix": ".", + "methodDetailsPrefix": "Para metodología y fórmulas, consulta", + "methodDetailsSuffix": ".", + "paperLink": "el artículo de investigación", + "calculationTitle": "Notas de cálculo", + "calculationIntro": "La tabla usa las siguientes definiciones de calificación mayoritaria para las dos variantes.", + "variantIFormulaLabel": "Variante (I): mayoría solo humana", + "variantIIFormulaLabel": "Variante (II): mayoría que incluye la calificación del modelo", + "humanShareDefinition": "La proporción de evaluadores humanos que están de acuerdo con el enunciado i.", + "omegaDefinition": "El conjunto de evaluadores humanos asignados al enunciado i.", + "alphaDefinition": "La calificación del modelo para el enunciado i.", + "ratingsDefinition": "La calificación del evaluador humano j para el enunciado i.", + "indicatorDefinition": "Una función indicadora que devuelve 1 cuando la condición es verdadera y 0 en caso contrario." } } diff --git a/client/public/locales/fr/translation.json b/client/public/locales/fr/translation.json index 6b6fba5..9cd85d8 100644 --- a/client/public/locales/fr/translation.json +++ b/client/public/locales/fr/translation.json @@ -98,7 +98,8 @@ "dashboard": "Tableau de bord", "people": "Personnes", "research": "Recherche", - "signin": "Se connecter" + "signin": "Se connecter", + "llmEvals": "Évaluations LLM" }, "navbar.participate →": "Participer →", "publications": { @@ -339,5 +340,53 @@ "question": "Un chien et un chat pèsent 100 livres au total. Le chien pèse 86 livres. 
Quelle est la différence de poids entre le chien et le chat ?", "intuitiveExplanation": "86 lbs (le poids du chien) semble évident" } + }, + "llmEvals": { + "eyebrow": "Benchmark de sens commun", + "title": "Évaluations du sens commun des LLM", + "intro": "Ce tableau compare les modèles de langage selon trois mesures individuelles du sens commun : consensus, conscience et commonsensicalité.", + "methodology": "Le consensus mesure si un modèle est d'accord avec la majorité humaine pour chaque énoncé. La conscience mesure s'il prédit correctement ce que la plupart des personnes penseraient. La commonsensicalité combine ces deux signaux.", + "detailsPrefix": "Pour plus de détails sur le cadre d'évaluation et les résultats, consultez", + "detailsLink": "la page de démonstration du sens commun des LLM", + "detailsSuffix": ".", + "modelsEvaluated": "Modèles évalués", + "topCommonsensicalityII": "Meilleure commonsensicalité (II)", + "scoreRank": "score {{score}}, rang {{rank}}", + "lastUpdated": "Dernière mise à jour", + "tableTitle": "Résultats de l'évaluation", + "tableHelp": "Recherchez par modèle, filtrez par score minimal de métrique ou cliquez sur un en-tête de colonne pour trier.", + "modelFilter": "Modèle", + "searchPlaceholder": "Rechercher des modèles", + "scoreMetric": "Métrique de score", + "minimumScore": "Score minimal", + "anyPlaceholder": "Tous", + "caption": "Scores d'évaluation du sens commun des LLM", + "showing": "{{shown}} modèles affichés sur {{total}}", + "scoreNote": "Le score est affiché sous forme de valeur avec le rang entre parenthèses.", + "columns": { + "model": "Modèle", + "added": "Ajouté", + "consensusI": "Consensus (I)", + "consensusII": "Consensus (II)", + "awarenessI": "Conscience (I)", + "awarenessII": "Conscience (II)", + "commonsensicalityI": "Commonsensicalité (I)", + "commonsensicalityII": "Commonsensicalité (II)" + }, + "variants": "Chaque score est rapporté selon deux variantes. 
Dans la variante (I), la majorité humaine est déterminée uniquement à partir des évaluations humaines. Dans la variante (II), la majorité est recalculée après inclusion de l'évaluation du modèle. Les scores sont des pourcentages, et les nombres entre parenthèses indiquent le rang de chaque modèle dans cette colonne.", + "modelDetailsPrefix": "Pour des comparaisons supplémentaires entre modèles, consultez", + "modelDetailsSuffix": ".", + "methodDetailsPrefix": "Pour la méthodologie et les formules, consultez", + "methodDetailsSuffix": ".", + "paperLink": "l'article de recherche", + "calculationTitle": "Notes de calcul", + "calculationIntro": "Le tableau utilise les définitions suivantes de la note majoritaire pour les deux variantes.", + "variantIFormulaLabel": "Variante (I) : majorité humaine uniquement", + "variantIIFormulaLabel": "Variante (II) : majorité incluant l'évaluation du modèle", + "humanShareDefinition": "La part des évaluateurs humains qui sont d'accord avec l'énoncé i.", + "omegaDefinition": "L'ensemble des évaluateurs humains assignés à l'énoncé i.", + "alphaDefinition": "L'évaluation du modèle pour l'énoncé i.", + "ratingsDefinition": "L'évaluation de l'évaluateur humain j pour l'énoncé i.", + "indicatorDefinition": "Une fonction indicatrice qui renvoie 1 lorsque la condition est vraie et 0 sinon." 
} } diff --git a/client/public/locales/hi/translation.json b/client/public/locales/hi/translation.json index e6724b9..d2feb74 100644 --- a/client/public/locales/hi/translation.json +++ b/client/public/locales/hi/translation.json @@ -98,7 +98,8 @@ "dashboard": "डैशबोर्ड", "people": "लोग", "research": "शोध", - "signin": "साइन इन करें" + "signin": "साइन इन करें", + "llmEvals": "एलएलएम मूल्यांकन" }, "navbar.participate →": "भाग लें →", "publications": { @@ -339,5 +340,53 @@ "question": "एक कुत्ते और एक बिल्ली का कुल वजन 100 पाउंड है। कुत्ते का वजन 86 पाउंड है। कुत्ते और बिल्ली के वजन में कितना अंतर है?", "intuitiveExplanation": "86 पाउंड (कुत्ते का वजन) स्पष्ट लगता है" } + }, + "llmEvals": { + "eyebrow": "सामान्य-बोध बेंचमार्क", + "title": "एलएलएम सामान्य-बोध मूल्यांकन", + "intro": "यह तालिका भाषा मॉडलों की तीन व्यक्तिगत-स्तर की सामान्य-बोध मापों पर तुलना करती है: consensus, awareness और commonsensicality।", + "methodology": "Consensus मापता है कि कोई मॉडल हर कथन पर मानव बहुमत से सहमत है या नहीं। Awareness मापता है कि वह सही अनुमान लगाता है या नहीं कि अधिकतर लोग क्या सोचेंगे। Commonsensicality इन दोनों संकेतों को मिलाती है।", + "detailsPrefix": "मूल्यांकन framework और results के बारे में अधिक जानकारी के लिए देखें", + "detailsLink": "एलएलएम सामान्य-बोध डेमो पेज", + "detailsSuffix": ".", + "modelsEvaluated": "मूल्यांकित मॉडल", + "topCommonsensicalityII": "शीर्ष commonsensicality (II)", + "scoreRank": "{{score}} score, rank {{rank}}", + "lastUpdated": "अंतिम अपडेट", + "tableTitle": "मूल्यांकन परिणाम", + "tableHelp": "मॉडल से खोजें, न्यूनतम metric score से filter करें, या sort करने के लिए किसी भी column heading पर click करें।", + "modelFilter": "मॉडल", + "searchPlaceholder": "मॉडल खोजें", + "scoreMetric": "Score metric", + "minimumScore": "न्यूनतम score", + "anyPlaceholder": "कोई भी", + "caption": "एलएलएम सामान्य-बोध मूल्यांकन scores", + "showing": "{{total}} मॉडलों में से {{shown}} दिखाए जा रहे हैं", + "scoreNote": "Score को value के रूप में दिखाया गया है और rank को 
कोष्ठक में।", + "columns": { + "model": "मॉडल", + "added": "जोड़ा गया", + "consensusI": "Consensus (I)", + "consensusII": "Consensus (II)", + "awarenessI": "Awareness (I)", + "awarenessII": "Awareness (II)", + "commonsensicalityI": "Commonsensicality (I)", + "commonsensicalityII": "Commonsensicality (II)" + }, + "variants": "हर score दो variants में report किया गया है। Variant (I) में मानव majority केवल मानव ratings से निर्धारित होती है। Variant (II) में model rating को शामिल करने के बाद majority फिर से calculate होती है। Scores percentage हैं, और parentheses में दिए नंबर उस column में हर model की ranking दिखाते हैं।", + "modelDetailsPrefix": "अतिरिक्त model comparisons के लिए देखें", + "modelDetailsSuffix": ".", + "methodDetailsPrefix": "Methodology और formulas के लिए देखें", + "methodDetailsSuffix": ".", + "paperLink": "research paper", + "calculationTitle": "Calculation notes", + "calculationIntro": "तालिका दोनों variants के लिए नीचे दी गई majority-rating definitions का उपयोग करती है।", + "variantIFormulaLabel": "Variant (I): केवल मानव majority", + "variantIIFormulaLabel": "Variant (II): model rating सहित majority", + "humanShareDefinition": "कथन i से सहमत मानव raters का share।", + "omegaDefinition": "कथन i को assigned मानव raters का set।", + "alphaDefinition": "कथन i के लिए model rating।", + "ratingsDefinition": "कथन i के लिए मानव rater j की rating।", + "indicatorDefinition": "एक indicator function जो condition true होने पर 1 और अन्यथा 0 लौटाता है।" } } diff --git a/client/public/locales/ja/translation.json b/client/public/locales/ja/translation.json index 5f83efa..c446b6a 100644 --- a/client/public/locales/ja/translation.json +++ b/client/public/locales/ja/translation.json @@ -98,7 +98,8 @@ "dashboard": "ダッシュボード", "people": "人々", "research": "研究", - "signin": "サインイン" + "signin": "サインイン", + "llmEvals": "LLM 評価" }, "navbar.participate →": "参加する →", "publications": { @@ -339,5 +340,53 @@ "question": "犬と猫の合計体重は100ポンドです。犬の体重は86ポンドです。犬と猫の体重差はいくらですか?", 
"intuitiveExplanation": "86ポンド(犬の体重) 正解に見えてしまいます" } + }, + "llmEvals": { + "eyebrow": "常識ベンチマーク", + "title": "LLM 常識評価", + "intro": "この表は、合意、認識、常識性という3つの個体レベルの常識指標で言語モデルを比較します。", + "methodology": "合意は、各文についてモデルが人間の多数派と一致するかを測ります。認識は、多くの人がどう考えるかをモデルが正しく予測できるかを測ります。常識性は、この2つの信号を組み合わせたものです。", + "detailsPrefix": "評価フレームワークと結果の詳細については、", + "detailsLink": "LLM 常識デモページ", + "detailsSuffix": "をご覧ください。", + "modelsEvaluated": "評価済みモデル", + "topCommonsensicalityII": "最高の常識性 (II)", + "scoreRank": "スコア {{score}}、順位 {{rank}}", + "lastUpdated": "最終更新", + "tableTitle": "評価結果", + "tableHelp": "モデルで検索し、最小メトリックスコアで絞り込み、または任意の列見出しをクリックして並べ替えます。", + "modelFilter": "モデル", + "searchPlaceholder": "モデルを検索", + "scoreMetric": "スコア指標", + "minimumScore": "最小スコア", + "anyPlaceholder": "指定なし", + "caption": "LLM 常識評価スコア", + "showing": "{{total}} 件中 {{shown}} 件のモデルを表示", + "scoreNote": "スコアは値として表示され、順位は括弧内に示されます。", + "columns": { + "model": "モデル", + "added": "追加日", + "consensusI": "合意 (I)", + "consensusII": "合意 (II)", + "awarenessI": "認識 (I)", + "awarenessII": "認識 (II)", + "commonsensicalityI": "常識性 (I)", + "commonsensicalityII": "常識性 (II)" + }, + "variants": "各スコアは2つのバリアントで報告されています。バリアント (I) では、人間の多数派は人間の評価だけから決まります。バリアント (II) では、モデルの評価を含めたうえで多数派を再計算します。スコアはパーセントで、括弧内の数字はその列における各モデルの順位を示します。", + "modelDetailsPrefix": "追加のモデル比較については、", + "modelDetailsSuffix": "をご覧ください。", + "methodDetailsPrefix": "方法と数式については、", + "methodDetailsSuffix": "をご覧ください。", + "paperLink": "研究論文", + "calculationTitle": "計算メモ", + "calculationIntro": "この表では、2つのバリアントについて次の多数派評価の定義を使用しています。", + "variantIFormulaLabel": "バリアント (I):人間のみの多数派", + "variantIIFormulaLabel": "バリアント (II):モデル評価を含む多数派", + "humanShareDefinition": "文 i に同意した人間評価者の割合。", + "omegaDefinition": "文 i に割り当てられた人間評価者の集合。", + "alphaDefinition": "文 i に対するモデルの評価。", + "ratingsDefinition": "文 i に対する人間評価者 j の評価。", + "indicatorDefinition": "条件が真なら 1、そうでなければ 0 を返す指示関数。" } } diff --git a/client/public/locales/pt/translation.json b/client/public/locales/pt/translation.json index 
10e4a73..6d40c0a 100644 --- a/client/public/locales/pt/translation.json +++ b/client/public/locales/pt/translation.json @@ -98,7 +98,8 @@ "dashboard": "Painel de controle", "people": "Pessoas", "research": "Pesquisa", - "signin": "Entrar" + "signin": "Entrar", + "llmEvals": "Avaliações de LLM" }, "navbar.participate →": "Participar →", "publications": { @@ -339,5 +340,53 @@ "question": "Um cão e um gato pesam 100 libras no total. O cão pesa 86 libras. Qual é a diferença de peso entre o cão e o gato?", "intuitiveExplanation": "86 lbs (o peso do cão) parece óbvio" } + }, + "llmEvals": { + "eyebrow": "Benchmark de senso comum", + "title": "Avaliações de senso comum de LLM", + "intro": "Esta tabela compara modelos de linguagem em três medidas de senso comum no nível individual: consenso, consciência e commonsensicalidade.", + "methodology": "O consenso mede se um modelo concorda com a maioria humana para cada afirmação. A consciência mede se ele prediz corretamente o que a maioria das pessoas pensaria. 
A commonsensicalidade combina esses dois sinais.", + "detailsPrefix": "Para mais detalhes sobre a estrutura de avaliação e os resultados, visite", + "detailsLink": "a página de demonstração de senso comum de LLM", + "detailsSuffix": ".", + "modelsEvaluated": "Modelos avaliados", + "topCommonsensicalityII": "Maior commonsensicalidade (II)", + "scoreRank": "score {{score}}, posição {{rank}}", + "lastUpdated": "Última atualização", + "tableTitle": "Resultados da avaliação", + "tableHelp": "Pesquise por modelo, filtre por um score mínimo de métrica ou clique em qualquer cabeçalho de coluna para ordenar.", + "modelFilter": "Modelo", + "searchPlaceholder": "Pesquisar modelos", + "scoreMetric": "Métrica de score", + "minimumScore": "Score mínimo", + "anyPlaceholder": "Qualquer", + "caption": "Scores de avaliação de senso comum de LLM", + "showing": "Mostrando {{shown}} de {{total}} modelos", + "scoreNote": "O score é mostrado como valor com a posição entre parênteses.", + "columns": { + "model": "Modelo", + "added": "Adicionado", + "consensusI": "Consenso (I)", + "consensusII": "Consenso (II)", + "awarenessI": "Consciência (I)", + "awarenessII": "Consciência (II)", + "commonsensicalityI": "Commonsensicalidade (I)", + "commonsensicalityII": "Commonsensicalidade (II)" + }, + "variants": "Cada score é apresentado em duas variantes. Na variante (I), a maioria humana é determinada apenas pelas avaliações humanas. Na variante (II), a maioria é recalculada depois de incluir a avaliação do modelo. 
Os scores são porcentagens, e os números entre parênteses mostram o ranking de cada modelo naquela coluna.", + "modelDetailsPrefix": "Para comparações adicionais de modelos, visite", + "modelDetailsSuffix": ".", + "methodDetailsPrefix": "Para metodologia e fórmulas, veja", + "methodDetailsSuffix": ".", + "paperLink": "o artigo de pesquisa", + "calculationTitle": "Notas de cálculo", + "calculationIntro": "A tabela usa as seguintes definições de avaliação majoritária para as duas variantes.", + "variantIFormulaLabel": "Variante (I): maioria apenas humana", + "variantIIFormulaLabel": "Variante (II): maioria incluindo a avaliação do modelo", + "humanShareDefinition": "A proporção de avaliadores humanos que concordam com a afirmação i.", + "omegaDefinition": "O conjunto de avaliadores humanos atribuídos à afirmação i.", + "alphaDefinition": "A avaliação do modelo para a afirmação i.", + "ratingsDefinition": "A avaliação do avaliador humano j para a afirmação i.", + "indicatorDefinition": "Uma função indicadora que retorna 1 quando a condição é verdadeira e 0 caso contrário." } } diff --git a/client/public/locales/ru/translation.json b/client/public/locales/ru/translation.json index 6e7a645..6469c15 100644 --- a/client/public/locales/ru/translation.json +++ b/client/public/locales/ru/translation.json @@ -98,7 +98,8 @@ "dashboard": "Панель управления", "people": "Люди", "research": "Исследование", - "signin": "Войти" + "signin": "Войти", + "llmEvals": "Оценки LLM" }, "navbar.participate →": "Участвовать →", "publications": { @@ -339,5 +340,53 @@ "question": "Собака и кошка вместе весят 100 фунтов. Собака весит 86 фунтов. 
Какова разница в весе между собакой и кошкой?", "intuitiveExplanation": "Вес собаки (86 фунтов) кажется очевидным" } + }, + "llmEvals": { + "eyebrow": "Бенчмарк здравого смысла", + "title": "Оценки здравого смысла LLM", + "intro": "Эта таблица сравнивает языковые модели по трем индивидуальным метрикам здравого смысла: консенсусу, осведомленности и здравомыслию.", + "methodology": "Консенсус показывает, согласна ли модель с человеческим большинством по каждому утверждению. Осведомленность показывает, правильно ли модель предсказывает, что думает большинство людей. Здравомыслие объединяет эти два сигнала.", + "detailsPrefix": "Подробнее о методике оценки и результатах см.", + "detailsLink": "демо-страницу здравого смысла LLM", + "detailsSuffix": ".", + "modelsEvaluated": "Оцененные модели", + "topCommonsensicalityII": "Лучшее здравомыслие (II)", + "scoreRank": "оценка {{score}}, место {{rank}}", + "lastUpdated": "Последнее обновление", + "tableTitle": "Результаты оценки", + "tableHelp": "Ищите по модели, фильтруйте по минимальному значению метрики или нажмите заголовок любого столбца для сортировки.", + "modelFilter": "Модель", + "searchPlaceholder": "Поиск моделей", + "scoreMetric": "Метрика оценки", + "minimumScore": "Минимальная оценка", + "anyPlaceholder": "Любая", + "caption": "Оценки здравого смысла LLM", + "showing": "Показано {{shown}} из {{total}} моделей", + "scoreNote": "Оценка показана как значение, а место указано в скобках.", + "columns": { + "model": "Модель", + "added": "Добавлено", + "consensusI": "Консенсус (I)", + "consensusII": "Консенсус (II)", + "awarenessI": "Осведомленность (I)", + "awarenessII": "Осведомленность (II)", + "commonsensicalityI": "Здравомыслие (I)", + "commonsensicalityII": "Здравомыслие (II)" + }, + "variants": "Каждая оценка приведена в двух вариантах. В варианте (I) человеческое большинство определяется только по человеческим оценкам. В варианте (II) большинство пересчитывается после включения оценки модели. 
Оценки являются процентами, а числа в скобках показывают место каждой модели в соответствующем столбце.", + "modelDetailsPrefix": "Для дополнительных сравнений моделей см.", + "modelDetailsSuffix": ".", + "methodDetailsPrefix": "Методику и формулы см. в", + "methodDetailsSuffix": ".", + "paperLink": "исследовательской статье", + "calculationTitle": "Примечания к расчету", + "calculationIntro": "В таблице используются следующие определения рейтинга большинства для двух вариантов.", + "variantIFormulaLabel": "Вариант (I): только человеческое большинство", + "variantIIFormulaLabel": "Вариант (II): большинство с учетом оценки модели", + "humanShareDefinition": "Доля человеческих оценщиков, согласных с утверждением i.", + "omegaDefinition": "Множество человеческих оценщиков, назначенных утверждению i.", + "alphaDefinition": "Оценка модели для утверждения i.", + "ratingsDefinition": "Оценка человеческого оценщика j для утверждения i.", + "indicatorDefinition": "Индикаторная функция, которая возвращает 1, если условие истинно, и 0 в противном случае." 
} } diff --git a/client/public/locales/zh/translation.json b/client/public/locales/zh/translation.json index f4918cf..491d7e9 100644 --- a/client/public/locales/zh/translation.json +++ b/client/public/locales/zh/translation.json @@ -98,7 +98,8 @@ "dashboard": "仪表板", "people": "人员", "research": "研究", - "signin": "登录" + "signin": "登录", + "llmEvals": "LLM 评测" }, "navbar.participate →": "参与 →", "publications": { @@ -339,5 +340,53 @@ "question": "一只狗和一只猫的总重量是 100 磅。狗的重量是 86 磅。狗和猫的重量差是多少?", "intuitiveExplanation": "86 磅(狗的重量) 看起来很直观" } + }, + "llmEvals": { + "eyebrow": "常识基准", + "title": "LLM 常识评测", + "intro": "本表从三个个体层面的常识指标比较语言模型:共识、认知和常识性。", + "methodology": "共识衡量模型是否在每个陈述上同意人类多数意见。认知衡量模型是否能正确预测大多数人的想法。常识性综合这两个信号。", + "detailsPrefix": "如需了解评估框架和结果的更多细节,请访问", + "detailsLink": "LLM 常识演示页面", + "detailsSuffix": "。", + "modelsEvaluated": "已评估模型", + "topCommonsensicalityII": "最高常识性 (II)", + "scoreRank": "{{score}} 分,排名 {{rank}}", + "lastUpdated": "最后更新", + "tableTitle": "评估结果", + "tableHelp": "按模型搜索,按最低指标分数筛选,或点击任意列标题排序。", + "modelFilter": "模型", + "searchPlaceholder": "搜索模型", + "scoreMetric": "分数指标", + "minimumScore": "最低分数", + "anyPlaceholder": "任意", + "caption": "LLM 常识评估分数", + "showing": "正在显示 {{total}} 个模型中的 {{shown}} 个", + "scoreNote": "分数显示为数值,括号中为排名。", + "columns": { + "model": "模型", + "added": "添加日期", + "consensusI": "共识 (I)", + "consensusII": "共识 (II)", + "awarenessI": "认知 (I)", + "awarenessII": "认知 (II)", + "commonsensicalityI": "常识性 (I)", + "commonsensicalityII": "常识性 (II)" + }, + "variants": "每个分数都在两个变体下报告。在变体 (I) 中,人类多数意见只由人类评分决定。在变体 (II) 中,加入模型评分后重新计算多数意见。分数为百分比,括号中的数字表示每个模型在该列中的排名。", + "modelDetailsPrefix": "如需更多模型比较,请访问", + "modelDetailsSuffix": "。", + "methodDetailsPrefix": "如需方法和公式,请参见", + "methodDetailsSuffix": "。", + "paperLink": "研究论文", + "calculationTitle": "计算说明", + "calculationIntro": "本表对两个变体使用以下多数评分定义。", + "variantIFormulaLabel": "变体 (I):仅人类多数意见", + "variantIIFormulaLabel": "变体 (II):包含模型评分的多数意见", + "humanShareDefinition": "同意陈述 i 的人类评分者比例。", + 
"omegaDefinition": "被分配到陈述 i 的人类评分者集合。", + "alphaDefinition": "模型对陈述 i 的评分。", + "ratingsDefinition": "人类评分者 j 对陈述 i 的评分。", + "indicatorDefinition": "指示函数:条件为真时返回 1,否则返回 0。" } } diff --git a/client/src/App.tsx b/client/src/App.tsx index ba98cd1..749c010 100644 --- a/client/src/App.tsx +++ b/client/src/App.tsx @@ -20,6 +20,7 @@ import SignIn from "./pages/SignIn"; import Welcome from "./pages/Welcome"; import Dashboard from "./pages/Dashboard"; import Finish from "./pages/Finish"; +import LlmEvals from "./pages/LlmEvals"; // components import Consent from "./components/Consent"; @@ -48,12 +49,12 @@ const App = () => { - } /> } /> } /> } /> + } /> } /> } /> } /> diff --git a/client/src/data/llmEvals.json b/client/src/data/llmEvals.json new file mode 100644 index 0000000..1368bf6 --- /dev/null +++ b/client/src/data/llmEvals.json @@ -0,0 +1,390 @@ +{ + "lastUpdated": "2026-05-11", + "rows": [ + { + "id": "claude-3-haiku", + "model": "Claude 3 Haiku", + "addedDate": "2026-05-11", + "consensusI": { "score": 58.8, "rank": 31 }, + "consensusII": { "score": 60.5, "rank": 31 }, + "awarenessI": { "score": 64.1, "rank": 30 }, + "awarenessII": { "score": 65.5, "rank": 30 }, + "commonsensicalityI": { "score": 61.4, "rank": 32 }, + "commonsensicalityII": { "score": 63.0, "rank": 32 } + }, + { + "id": "claude-3-sonnet", + "model": "Claude 3 Sonnet", + "addedDate": "2026-05-11", + "consensusI": { "score": 60.9, "rank": 30 }, + "consensusII": { "score": 62.6, "rank": 30 }, + "awarenessI": { "score": 62.2, "rank": 31 }, + "awarenessII": { "score": 63.7, "rank": 31 }, + "commonsensicalityI": { "score": 61.5, "rank": 31 }, + "commonsensicalityII": { "score": 63.1, "rank": 31 } + }, + { + "id": "claude-3-opus", + "model": "Claude 3 Opus", + "addedDate": "2026-05-11", + "consensusI": { "score": 73.4, "rank": 19 }, + "consensusII": { "score": 75.1, "rank": 20 }, + "awarenessI": { "score": 77.4, "rank": 15 }, + "awarenessII": { "score": 78.2, "rank": 15 }, + "commonsensicalityI": { "score": 
75.4, "rank": 17 }, + "commonsensicalityII": { "score": 76.6, "rank": 17 } + }, + { + "id": "dbrx", + "model": "DBRX", + "addedDate": "2026-05-11", + "consensusI": { "score": 73.7, "rank": 18 }, + "consensusII": { "score": 75.2, "rank": 18 }, + "awarenessI": { "score": 79.0, "rank": 13 }, + "awarenessII": { "score": 79.9, "rank": 13 }, + "commonsensicalityI": { "score": 76.3, "rank": 15 }, + "commonsensicalityII": { "score": 77.5, "rank": 14 } + }, + { + "id": "falcon-7b", + "model": "Falcon-7B", + "addedDate": "2026-05-11", + "consensusI": { "score": 66.6, "rank": 27 }, + "consensusII": { "score": 68.1, "rank": 27 }, + "awarenessI": { "score": 66.1, "rank": 29 }, + "awarenessII": { "score": 67.7, "rank": 28 }, + "commonsensicalityI": { "score": 66.3, "rank": 27 }, + "commonsensicalityII": { "score": 67.9, "rank": 27 } + }, + { + "id": "falcon-40b", + "model": "Falcon-40B", + "addedDate": "2026-05-11", + "consensusI": { "score": 73.0, "rank": 22 }, + "consensusII": { "score": 74.8, "rank": 22 }, + "awarenessI": { "score": 77.2, "rank": 16 }, + "awarenessII": { "score": 77.9, "rank": 17 }, + "commonsensicalityI": { "score": 75.1, "rank": 19 }, + "commonsensicalityII": { "score": 76.3, "rank": 20 } + }, + { + "id": "falcon-180b", + "model": "Falcon-180B", + "addedDate": "2026-05-11", + "consensusI": { "score": 78.6, "rank": 8 }, + "consensusII": { "score": 80.3, "rank": 8 }, + "awarenessI": { "score": 81.3, "rank": 6 }, + "awarenessII": { "score": 82.5, "rank": 5 }, + "commonsensicalityI": { "score": 79.9, "rank": 9 }, + "commonsensicalityII": { "score": 81.4, "rank": 9 } + }, + { + "id": "flan-t5-small", + "model": "Flan-T5-Small", + "addedDate": "2026-05-11", + "consensusI": { "score": 34.4, "rank": 35 }, + "consensusII": { "score": 36.1, "rank": 35 }, + "awarenessI": { "score": 33.9, "rank": 35 }, + "awarenessII": { "score": 35.6, "rank": 35 }, + "commonsensicalityI": { "score": 34.2, "rank": 35 }, + "commonsensicalityII": { "score": 35.8, "rank": 35 } + }, + { +
"id": "flan-t5-base", + "model": "Flan-T5-Base", + "addedDate": "2026-05-11", + "consensusI": { "score": 56.8, "rank": 33 }, + "consensusII": { "score": 58.6, "rank": 33 }, + "awarenessI": { "score": 59.5, "rank": 33 }, + "awarenessII": { "score": 60.6, "rank": 33 }, + "commonsensicalityI": { "score": 58.1, "rank": 33 }, + "commonsensicalityII": { "score": 59.6, "rank": 33 } + }, + { + "id": "flan-t5-large", + "model": "Flan-T5-Large", + "addedDate": "2026-05-11", + "consensusI": { "score": 77.3, "rank": 14 }, + "consensusII": { "score": 78.9, "rank": 14 }, + "awarenessI": { "score": 76.5, "rank": 18 }, + "awarenessII": { "score": 77.9, "rank": 16 }, + "commonsensicalityI": { "score": 76.9, "rank": 12 }, + "commonsensicalityII": { "score": 78.4, "rank": 12 } + }, + { + "id": "flan-t5-xl", + "model": "Flan-T5-XL", + "addedDate": "2026-05-11", + "consensusI": { "score": 73.3, "rank": 20 }, + "consensusII": { "score": 75.0, "rank": 21 }, + "awarenessI": { "score": 72.7, "rank": 23 }, + "awarenessII": { "score": 74.1, "rank": 23 }, + "commonsensicalityI": { "score": 73.0, "rank": 23 }, + "commonsensicalityII": { "score": 74.6, "rank": 23 } + }, + { + "id": "flan-t5-xxl", + "model": "Flan-T5-XXL", + "addedDate": "2026-05-11", + "consensusI": { "score": 79.9, "rank": 6 }, + "consensusII": { "score": 81.6, "rank": 6 }, + "awarenessI": { "score": 80.9, "rank": 9 }, + "awarenessII": { "score": 82.1, "rank": 8 }, + "commonsensicalityI": { "score": 80.4, "rank": 7 }, + "commonsensicalityII": { "score": 81.8, "rank": 6 } + }, + { + "id": "gemma-2b", + "model": "Gemma-2B", + "addedDate": "2026-05-11", + "consensusI": { "score": 65.2, "rank": 29 }, + "consensusII": { "score": 66.8, "rank": 29 }, + "awarenessI": { "score": 66.6, "rank": 26 }, + "awarenessII": { "score": 67.9, "rank": 27 }, + "commonsensicalityI": { "score": 65.9, "rank": 28 }, + "commonsensicalityII": { "score": 67.3, "rank": 28 } + }, + { + "id": "gemma-7b", + "model": "Gemma-7B", + "addedDate": "2026-05-11", + 
"consensusI": { "score": 73.2, "rank": 21 }, + "consensusII": { "score": 75.1, "rank": 19 }, + "awarenessI": { "score": 70.9, "rank": 25 }, + "awarenessII": { "score": 72.3, "rank": 25 }, + "commonsensicalityI": { "score": 72.0, "rank": 25 }, + "commonsensicalityII": { "score": 73.7, "rank": 25 } + }, + { + "id": "gemini-pro-1-0", + "model": "Gemini Pro 1.0", + "addedDate": "2026-05-11", + "consensusI": { "score": 78.4, "rank": 9 }, + "consensusII": { "score": 80.0, "rank": 9 }, + "awarenessI": { "score": 81.1, "rank": 7 }, + "awarenessII": { "score": 82.2, "rank": 7 }, + "commonsensicalityI": { "score": 79.7, "rank": 10 }, + "commonsensicalityII": { "score": 81.1, "rank": 10 } + }, + { + "id": "gpt-3-5", + "model": "GPT-3.5", + "addedDate": "2026-05-11", + "consensusI": { "score": 78.3, "rank": 10 }, + "consensusII": { "score": 80.0, "rank": 10 }, + "awarenessI": { "score": 75.4, "rank": 20 }, + "awarenessII": { "score": 76.3, "rank": 20 }, + "commonsensicalityI": { "score": 76.8, "rank": 13 }, + "commonsensicalityII": { "score": 78.1, "rank": 13 } + }, + { + "id": "gpt-4-0125", + "model": "GPT-4-0125", + "addedDate": "2026-05-11", + "consensusI": { "score": 77.6, "rank": 13 }, + "consensusII": { "score": 79.1, "rank": 13 }, + "awarenessI": { "score": 79.2, "rank": 12 }, + "awarenessII": { "score": 80.2, "rank": 12 }, + "commonsensicalityI": { "score": 78.4, "rank": 11 }, + "commonsensicalityII": { "score": 79.7, "rank": 11 } + }, + { + "id": "gpt-4-0409", + "model": "GPT-4-0409", + "addedDate": "2026-05-11", + "consensusI": { "score": 78.0, "rank": 11 }, + "consensusII": { "score": 79.4, "rank": 11 }, + "awarenessI": { "score": 83.2, "rank": 2 }, + "awarenessII": { "score": 84.2, "rank": 2 }, + "commonsensicalityI": { "score": 80.6, "rank": 5 }, + "commonsensicalityII": { "score": 81.8, "rank": 7 } + }, + { + "id": "gpt-4o", + "model": "GPT-4o", + "addedDate": "2026-05-11", + "consensusI": { "score": 72.5, "rank": 23 }, + "consensusII": { "score": 74.2, "rank": 
23 }, + "awarenessI": { "score": 77.9, "rank": 14 }, + "awarenessII": { "score": 78.8, "rank": 14 }, + "commonsensicalityI": { "score": 75.2, "rank": 18 }, + "commonsensicalityII": { "score": 76.4, "rank": 18 } + }, + { + "id": "gpt-5", + "model": "GPT-5", + "addedDate": "2026-05-11", + "consensusI": { "score": 71.9, "rank": 25 }, + "consensusII": { "score": 73.5, "rank": 25 }, + "awarenessI": { "score": 79.6, "rank": 11 }, + "awarenessII": { "score": 80.6, "rank": 11 }, + "commonsensicalityI": { "score": 75.7, "rank": 16 }, + "commonsensicalityII": { "score": 77.0, "rank": 16 } + }, + { + "id": "llama-2-7b", + "model": "LLaMA-2-7B", + "addedDate": "2026-05-11", + "consensusI": { "score": 74.0, "rank": 17 }, + "consensusII": { "score": 75.6, "rank": 17 }, + "awarenessI": { "score": 76.0, "rank": 19 }, + "awarenessII": { "score": 77.2, "rank": 19 }, + "commonsensicalityI": { "score": 75.0, "rank": 20 }, + "commonsensicalityII": { "score": 76.4, "rank": 19 } + }, + { + "id": "llama-2-13b-a", + "model": "LLaMA-2-13B", + "addedDate": "2026-05-11", + "consensusI": { "score": 57.2, "rank": 32 }, + "consensusII": { "score": 58.9, "rank": 32 }, + "awarenessI": { "score": 66.5, "rank": 27 }, + "awarenessII": { "score": 67.7, "rank": 28 }, + "commonsensicalityI": { "score": 61.7, "rank": 30 }, + "commonsensicalityII": { "score": 63.2, "rank": 30 } + }, + { + "id": "llama-2-13b-b", + "model": "LLaMA-2-13B", + "addedDate": "2026-05-11", + "consensusI": { "score": 48.5, "rank": 34 }, + "consensusII": { "score": 50.3, "rank": 34 }, + "awarenessI": { "score": 44.5, "rank": 34 }, + "awarenessII": { "score": 46.1, "rank": 34 }, + "commonsensicalityI": { "score": 46.5, "rank": 34 }, + "commonsensicalityII": { "score": 48.1, "rank": 34 } + }, + { + "id": "llama-2-70b", + "model": "LLaMA-2-70B", + "addedDate": "2026-05-11", + "consensusI": { "score": 65.7, "rank": 28 }, + "consensusII": { "score": 67.4, "rank": 28 }, + "awarenessI": { "score": 61.4, "rank": 32 }, + "awarenessII": { 
"score": 62.4, "rank": 32 }, + "commonsensicalityI": { "score": 63.5, "rank": 29 }, + "commonsensicalityII": { "score": 64.9, "rank": 29 } + }, + { + "id": "llama-3-70b", + "model": "LLaMA-3-70B", + "addedDate": "2026-05-11", + "consensusI": { "score": 72.0, "rank": 24 }, + "consensusII": { "score": 73.6, "rank": 24 }, + "awarenessI": { "score": 76.8, "rank": 17 }, + "awarenessII": { "score": 77.8, "rank": 18 }, + "commonsensicalityI": { "score": 74.4, "rank": 22 }, + "commonsensicalityII": { "score": 75.7, "rank": 22 } + }, + { + "id": "mistral-7b", + "model": "Mistral-7B", + "addedDate": "2026-05-11", + "consensusI": { "score": 80.2, "rank": 5 }, + "consensusII": { "score": 81.7, "rank": 5 }, + "awarenessI": { "score": 80.7, "rank": 10 }, + "awarenessII": { "score": 82.1, "rank": 9 }, + "commonsensicalityI": { "score": 80.4, "rank": 6 }, + "commonsensicalityII": { "score": 81.9, "rank": 5 } + }, + { + "id": "mixtral-8x7b", + "model": "Mixtral-8x7B", + "addedDate": "2026-05-11", + "consensusI": { "score": 77.8, "rank": 12 }, + "consensusII": { "score": 79.4, "rank": 12 }, + "awarenessI": { "score": 75.0, "rank": 21 }, + "awarenessII": { "score": 75.7, "rank": 21 }, + "commonsensicalityI": { "score": 76.4, "rank": 14 }, + "commonsensicalityII": { "score": 77.5, "rank": 15 } + }, + { + "id": "mixtral-8x22b", + "model": "Mixtral-8x22B", + "addedDate": "2026-05-11", + "consensusI": { "score": 80.7, "rank": 1 }, + "consensusII": { "score": 82.2, "rank": 1 }, + "awarenessI": { "score": 84.0, "rank": 1 }, + "awarenessII": { "score": 84.9, "rank": 1 }, + "commonsensicalityI": { "score": 82.3, "rank": 1 }, + "commonsensicalityII": { "score": 83.6, "rank": 1 } + }, + { + "id": "mistral-large", + "model": "Mistral-Large", + "addedDate": "2026-05-11", + "consensusI": { "score": 80.4, "rank": 4 }, + "consensusII": { "score": 82.0, "rank": 4 }, + "awarenessI": { "score": 82.2, "rank": 3 }, + "awarenessII": { "score": 83.4, "rank": 3 }, + "commonsensicalityI": { "score": 81.3, 
"rank": 2 }, + "commonsensicalityII": { "score": 82.7, "rank": 2 } + }, + { + "id": "olmo-7b", + "model": "OLMo-7B", + "addedDate": "2026-05-11", + "consensusI": { "score": 74.3, "rank": 16 }, + "consensusII": { "score": 75.9, "rank": 16 }, + "awarenessI": { "score": 71.0, "rank": 24 }, + "awarenessII": { "score": 72.3, "rank": 24 }, + "commonsensicalityI": { "score": 72.7, "rank": 24 }, + "commonsensicalityII": { "score": 74.1, "rank": 24 } + }, + { + "id": "qwen2-0-5b", + "model": "Qwen2-0.5B", + "addedDate": "2026-05-11", + "consensusI": { "score": 67.1, "rank": 26 }, + "consensusII": { "score": 68.7, "rank": 26 }, + "awarenessI": { "score": 66.5, "rank": 27 }, + "awarenessII": { "score": 67.9, "rank": 26 }, + "commonsensicalityI": { "score": 66.8, "rank": 26 }, + "commonsensicalityII": { "score": 68.3, "rank": 26 } + }, + { + "id": "qwen2-1-5b", + "model": "Qwen2-1.5B", + "addedDate": "2026-05-11", + "consensusI": { "score": 75.4, "rank": 15 }, + "consensusII": { "score": 76.8, "rank": 15 }, + "awarenessI": { "score": 73.8, "rank": 22 }, + "awarenessII": { "score": 74.9, "rank": 22 }, + "commonsensicalityI": { "score": 74.6, "rank": 21 }, + "commonsensicalityII": { "score": 75.8, "rank": 21 } + }, + { + "id": "qwen2-7b", + "model": "Qwen2-7B", + "addedDate": "2026-05-11", + "consensusI": { "score": 79.7, "rank": 7 }, + "consensusII": { "score": 81.2, "rank": 7 }, + "awarenessI": { "score": 81.1, "rank": 8 }, + "awarenessII": { "score": 82.0, "rank": 10 }, + "commonsensicalityI": { "score": 80.4, "rank": 8 }, + "commonsensicalityII": { "score": 81.6, "rank": 8 } + }, + { + "id": "qwen2-57b", + "model": "Qwen2-57B", + "addedDate": "2026-05-11", + "consensusI": { "score": 80.4, "rank": 3 }, + "consensusII": { "score": 82.1, "rank": 2 }, + "awarenessI": { "score": 81.4, "rank": 5 }, + "awarenessII": { "score": 82.4, "rank": 6 }, + "commonsensicalityI": { "score": 80.9, "rank": 4 }, + "commonsensicalityII": { "score": 82.3, "rank": 4 } + }, + { + "id": "qwen2-72b", 
+ "model": "Qwen2-72B", + "addedDate": "2026-05-11", + "consensusI": { "score": 80.5, "rank": 2 }, + "consensusII": { "score": 82.1, "rank": 3 }, + "awarenessI": { "score": 81.8, "rank": 4 }, + "awarenessII": { "score": 82.8, "rank": 4 }, + "commonsensicalityI": { "score": 81.1, "rank": 3 }, + "commonsensicalityII": { "score": 82.5, "rank": 3 } + } + ] +} diff --git a/client/src/pages/LlmEvals.tsx b/client/src/pages/LlmEvals.tsx new file mode 100644 index 0000000..3d3ab88 --- /dev/null +++ b/client/src/pages/LlmEvals.tsx @@ -0,0 +1,481 @@ +import React, { useMemo, useState } from "react"; +import { useTranslation } from "react-i18next"; + +import evalData from "../data/llmEvals.json"; +import Navbar from "../partials/NavBar"; +import Footer from "../partials/Footer"; + +type MetricKey = + | "consensusI" + | "consensusII" + | "awarenessI" + | "awarenessII" + | "commonsensicalityI" + | "commonsensicalityII"; + +type SortKey = "model" | "addedDate" | MetricKey; +type SortDirection = "asc" | "desc"; + +interface MetricScore { + score: number; + rank: number; +} + +interface LlmEvalRow extends Record { + id: string; + model: string; + addedDate: string; +} + +interface LlmEvalData { + lastUpdated: string; + rows: LlmEvalRow[]; +} + +const data = evalData as LlmEvalData; + +const metricColumns: { key: MetricKey; labelKey: string }[] = [ + { key: "consensusI", labelKey: "llmEvals.columns.consensusI" }, + { key: "consensusII", labelKey: "llmEvals.columns.consensusII" }, + { key: "awarenessI", labelKey: "llmEvals.columns.awarenessI" }, + { key: "awarenessII", labelKey: "llmEvals.columns.awarenessII" }, + { + key: "commonsensicalityI", + labelKey: "llmEvals.columns.commonsensicalityI", + }, + { + key: "commonsensicalityII", + labelKey: "llmEvals.columns.commonsensicalityII", + }, +]; + +const formatDate = (date: string, locale: string) => + new Intl.DateTimeFormat(locale, { + month: "long", + day: "numeric", + year: "numeric", + }).format(new Date(`${date}T00:00:00`)); + 
+const LlmEvals: React.FC = () => { + const { t, i18n } = useTranslation(); + const locale = i18n.resolvedLanguage || i18n.language || "en"; + const [query, setQuery] = useState(""); + const [metricFilter, setMetricFilter] = useState( + "commonsensicalityII", + ); + const [minimumScore, setMinimumScore] = useState(""); + const [sortConfig, setSortConfig] = useState<{ + key: SortKey; + direction: SortDirection; + }>({ + key: "commonsensicalityII", + direction: "desc", + }); + + const topModel = useMemo( + () => + [...data.rows].sort( + (a, b) => + b.commonsensicalityII.score - a.commonsensicalityII.score || + a.commonsensicalityII.rank - b.commonsensicalityII.rank, + )[0], + [], + ); + + const filteredRows = useMemo(() => { + const normalizedQuery = query.trim().toLowerCase(); + const parsedMinimum = Number.parseFloat(minimumScore); + const hasMinimum = !Number.isNaN(parsedMinimum); + + return data.rows.filter((row) => { + const matchesQuery = + normalizedQuery.length === 0 || + row.model.toLowerCase().includes(normalizedQuery); + const matchesMinimum = + !hasMinimum || row[metricFilter].score >= parsedMinimum; + + return matchesQuery && matchesMinimum; + }); + }, [metricFilter, minimumScore, query]); + + const sortedRows = useMemo(() => { + return [...filteredRows].sort((a, b) => { + if (sortConfig.key === "model") { + const compared = a.model.localeCompare(b.model); + return sortConfig.direction === "asc" ? compared : -compared; + } + + if (sortConfig.key === "addedDate") { + const compared = + a.addedDate.localeCompare(b.addedDate) || + a.model.localeCompare(b.model); + return sortConfig.direction === "asc" ? compared : -compared; + } + + const compared = + a[sortConfig.key].score - b[sortConfig.key].score || + b[sortConfig.key].rank - a[sortConfig.key].rank; + + return sortConfig.direction === "asc" ? 
compared : -compared; + }); + }, [filteredRows, sortConfig]); + + const handleSort = (key: SortKey) => { + setSortConfig((current) => ({ + key, + direction: + current.key === key && current.direction === "desc" ? "asc" : "desc", + })); + }; + + const renderSortMarker = (key: SortKey) => { + if (sortConfig.key !== key) return ""; + return sortConfig.direction === "asc" ? " ^" : " v"; + }; + + const renderMetric = (metric: MetricScore) => ( + + + {metric.score.toFixed(1)} + + + ({metric.rank}) + + + ); + + const formulaClass = + "mt-2 overflow-x-auto rounded border border-gray-200 p-4 text-sm text-gray-800 dark:border-gray-700 dark:text-gray-100"; + const mathClass = + "inline-flex min-w-max items-center gap-2 font-serif leading-8"; + const fractionClass = "inline-flex flex-col items-center px-1 align-middle"; + + return ( +
+ + +
+
+
+
+
+

+ {t("llmEvals.eyebrow")} +

+

+ {t("llmEvals.title")} +

+

+ {t("llmEvals.intro")} +

+

+ {t("llmEvals.methodology")} +

+

+ {t("llmEvals.variants")} +

+

+ {t("llmEvals.modelDetailsPrefix")}{" "} + + {t("llmEvals.detailsLink")} + + {t("llmEvals.modelDetailsSuffix")}{" "} + {t("llmEvals.methodDetailsPrefix")}{" "} + + {t("llmEvals.paperLink")} + + {t("llmEvals.methodDetailsSuffix")} +

+
+ +
+
+

+ {t("llmEvals.modelsEvaluated")} +

+

+ {data.rows.length} +

+
+
+

+ {t("llmEvals.topCommonsensicalityII")} +

+

+ {topModel.model} +

+

+ {t("llmEvals.scoreRank", { + score: topModel.commonsensicalityII.score.toFixed(1), + rank: topModel.commonsensicalityII.rank, + })} +

+
+
+

+ {t("llmEvals.lastUpdated")} +

+

+ {formatDate(data.lastUpdated, locale)} +

+
+
+ +
+
+
+
+

+ {t("llmEvals.tableTitle")} +

+

+ {t("llmEvals.tableHelp")} +

+
+ +
+ + + + + +
+
+
+ +
+ + + + + + + {metricColumns.map((column) => ( + + ))} + + + + {sortedRows.map((row) => ( + + + + {metricColumns.map((column) => ( + + ))} + + ))} + +
+ {t("llmEvals.caption")} +
+ + + + + +
+ {row.model} + + {formatDate(row.addedDate, locale)} + + {renderMetric(row[column.key])} +
+
+ +
+ + {t("llmEvals.showing", { + shown: sortedRows.length, + total: data.rows.length, + })} + + {t("llmEvals.scoreNote")} +
+ +
+

+ {t("llmEvals.calculationTitle")} +

+

+ {t("llmEvals.calculationIntro")} +

+ +
+
+

+ {t("llmEvals.variantIFormulaLabel")} +

+
+ + + majorityi + h + + = + + 1[di + h,a ≥ 0.5] + + +
+
+
+

+ {t("llmEvals.variantIIFormulaLabel")} +

+
+ + + majorityi + h + + = + 1[ + + + αi + m + ∑ + j ∈ Ωi A + i,j + + + |Ωi| + 1 + + + ≥ 0.5] + +
+
+
+ +
+
+
+ d_i^{"{h,a}"} +
+
{t("llmEvals.humanShareDefinition")}
+
+
+
+ Omega_i +
+
{t("llmEvals.omegaDefinition")}
+
+
+
+ alpha_i^m +
+
{t("llmEvals.alphaDefinition")}
+
+
+
+ A_i,j +
+
{t("llmEvals.ratingsDefinition")}
+
+
+
+ 1[...] +
+
{t("llmEvals.indicatorDefinition")}
+
+
+
+
+
+
+
+
+ +
+
+ ); +}; + +export default LlmEvals; diff --git a/client/src/partials/NavBar.tsx b/client/src/partials/NavBar.tsx index 31ec6a1..f2b68a4 100644 --- a/client/src/partials/NavBar.tsx +++ b/client/src/partials/NavBar.tsx @@ -19,7 +19,7 @@ const Navbar: React.FC = () => { debounce(() => { setTop(window.scrollY < 10); }, 100), // You can adjust the debounce time as needed - [] + [], ); useEffect(() => { @@ -62,6 +62,9 @@ const Navbar: React.FC = () => {
  • {t("navbar.research")}
  • +
  • + {t("navbar.llmEvals")} +
  • {!user ? ( {t("navbar.signin")} @@ -90,6 +93,11 @@ const Navbar: React.FC = () => { {t("navbar.research")}
  • +
  • + + {t("navbar.llmEvals")} + +
  • {!user ? ( From 63699a784495f8b44f172fb6ea91805137705497 Mon Sep 17 00:00:00 2001 From: Josh Nguyen Date: Mon, 18 May 2026 13:58:23 -0400 Subject: [PATCH 2/2] Add LLM Eval page --- client/public/llm-evals.html | 2273 +++++++++++++++++++++++++++++++++ client/src/pages/LlmEvals.tsx | 482 +------ 2 files changed, 2279 insertions(+), 476 deletions(-) create mode 100644 client/public/llm-evals.html diff --git a/client/public/llm-evals.html b/client/public/llm-evals.html new file mode 100644 index 0000000..abe8b5d --- /dev/null +++ b/client/public/llm-evals.html @@ -0,0 +1,2273 @@ + + + + + +Common Sense Explorer + + + + +
    + ← Home +

    Common Sense Explorer

    + +
    + + +
    +
    +
    + +

    Measuring Common Sense in Humans and LLMs

    +

    This website presents the results in the following paper.

    +
    + Tuan Dung Nguyen, Duncan J. Watts, and Mark E. Whiting. "A Large-Scale Evaluation of Commonsense Knowledge in Humans and Large Language Models." PNAS Nexus 5(3): pgag029 (2026). DOI: 10.1093/pnasnexus/pgag029. +
    + +
    @article{nguyenLargescaleEvaluationCommonsense2026,
    +  title = {A Large-Scale Evaluation of Commonsense Knowledge in Humans and Large Language Models},
    +  author = {Nguyen, Tuan Dung and Watts, Duncan J and Whiting, Mark E},
    +  year = 2026,
    +  journal = {PNAS Nexus},
    +  volume = {5},
    +  number = {3},
    +  pages = {pgag029},
    +  publisher = {Oxford University Press},
    +  issn = {2752-6542},
    +  doi = {10.1093/pnasnexus/pgag029},
    +  url = {https://academic.oup.com/pnasnexus/article/doi/10.1093/pnasnexus/pgag029/8487345},
    +  copyright = {https://creativecommons.org/licenses/by/4.0/},
    +}
    +

    Model ranking

    +

    The following models have been evaluated. For more details, see the next section.

    +
    +

    Evaluation settings

    +
    + Statement: “Eighty percent of success is showing up.”...(a) Agree?(b) Othersagree?...Avg.Avg.57%67%Statement: “An accountant is good at mathematics and logic.”(a) Agree?(b) Othersagree?89%100%B. LLM as survey respondentC. LLM as simulator of respondent population 90%95%79%85%A. Human survey respondents𝑝(“yes”)𝑝(“yes”)LLMLLM +
    +

    For every statement, humans and LLMs are asked to indicate (a) whether they agree with it and (b) whether they think most other people would agree with it. In panel A, a total of N = 2,046 human participants were recruited to perform this task. The "Avg." column denotes the percentage of people who answered "yes" to the corresponding question. In panel B, we treat each LLM (out of a total of N = 35 models) as an independent survey respondent, just like every human in panel A. This gives rise to the individual-level view of common sense, in which each model is measured based on its agreement with the majority of other people on every statement. In panel C, we treat every LLM's probability in its output answer as the average response of a hypothetical population of "silicon samples" (depicted as robots). For instance, if the LLM agrees with the statement "Eighty percent of success is showing up" with 90% probability, we interpret this to mean that 90% of the silicon samples would agree with this statement. This gives rise to the statement-level metric of common sense, which is used to measure the correlation between the human (panel A) and silicon sample (panel C) populations.

    + +

    Human common sense

    +

    From panel A in the figure above, we use the statement ratings of each individual to calculate their commonsensicality score. To see each individual's ratings and their commonsensicality score, check out the Human Individual panel at the top.

    + +

    LLMs as Individuals

    +

    From panel B in the figure above, we use the statement ratings of each LLM to calculate its commonsensicality score. To see the ratings and commonsensicality score, check out the LLM as Individual panel at the top.

    + +

    LLM as a Generator of a Population

    +

    From panel C in the figure above, we use each LLM as a generator of a hypothetical population. Each individual in this population rates statements like every human does. Then, for this population, we calculate a commonsensicality score for each statement. Thus, every statement has a commonsensicality score for each population, including humans and every population generated by every LLM.

    +

    In the Statement Commonsensicality panel at the top, you can browse all statements and their commonsensicality in every population.

    +

    In the Statement Score Correlation panel, you can compare every LLM-generated population with the human population. This comparison is based on the Pearson correlation between the statement commonsensicality scores of the two populations.

    +
    +
    +
    + + + + + + + + + + + + + +
    + + + diff --git a/client/src/pages/LlmEvals.tsx b/client/src/pages/LlmEvals.tsx index 3d3ab88..07edce5 100644 --- a/client/src/pages/LlmEvals.tsx +++ b/client/src/pages/LlmEvals.tsx @@ -1,481 +1,11 @@ -import React, { useMemo, useState } from "react"; -import { useTranslation } from "react-i18next"; +import { useEffect } from "react"; -import evalData from "../data/llmEvals.json"; -import Navbar from "../partials/NavBar"; -import Footer from "../partials/Footer"; +const LlmEvals = () => { + useEffect(() => { + window.location.replace("/llm-evals.html"); + }, []); -type MetricKey = - | "consensusI" - | "consensusII" - | "awarenessI" - | "awarenessII" - | "commonsensicalityI" - | "commonsensicalityII"; - -type SortKey = "model" | "addedDate" | MetricKey; -type SortDirection = "asc" | "desc"; - -interface MetricScore { - score: number; - rank: number; -} - -interface LlmEvalRow extends Record { - id: string; - model: string; - addedDate: string; -} - -interface LlmEvalData { - lastUpdated: string; - rows: LlmEvalRow[]; -} - -const data = evalData as LlmEvalData; - -const metricColumns: { key: MetricKey; labelKey: string }[] = [ - { key: "consensusI", labelKey: "llmEvals.columns.consensusI" }, - { key: "consensusII", labelKey: "llmEvals.columns.consensusII" }, - { key: "awarenessI", labelKey: "llmEvals.columns.awarenessI" }, - { key: "awarenessII", labelKey: "llmEvals.columns.awarenessII" }, - { - key: "commonsensicalityI", - labelKey: "llmEvals.columns.commonsensicalityI", - }, - { - key: "commonsensicalityII", - labelKey: "llmEvals.columns.commonsensicalityII", - }, -]; - -const formatDate = (date: string, locale: string) => - new Intl.DateTimeFormat(locale, { - month: "long", - day: "numeric", - year: "numeric", - }).format(new Date(`${date}T00:00:00`)); - -const LlmEvals: React.FC = () => { - const { t, i18n } = useTranslation(); - const locale = i18n.resolvedLanguage || i18n.language || "en"; - const [query, setQuery] = useState(""); - const [metricFilter, 
setMetricFilter] = useState( - "commonsensicalityII", - ); - const [minimumScore, setMinimumScore] = useState(""); - const [sortConfig, setSortConfig] = useState<{ - key: SortKey; - direction: SortDirection; - }>({ - key: "commonsensicalityII", - direction: "desc", - }); - - const topModel = useMemo( - () => - [...data.rows].sort( - (a, b) => - b.commonsensicalityII.score - a.commonsensicalityII.score || - a.commonsensicalityII.rank - b.commonsensicalityII.rank, - )[0], - [], - ); - - const filteredRows = useMemo(() => { - const normalizedQuery = query.trim().toLowerCase(); - const parsedMinimum = Number.parseFloat(minimumScore); - const hasMinimum = !Number.isNaN(parsedMinimum); - - return data.rows.filter((row) => { - const matchesQuery = - normalizedQuery.length === 0 || - row.model.toLowerCase().includes(normalizedQuery); - const matchesMinimum = - !hasMinimum || row[metricFilter].score >= parsedMinimum; - - return matchesQuery && matchesMinimum; - }); - }, [metricFilter, minimumScore, query]); - - const sortedRows = useMemo(() => { - return [...filteredRows].sort((a, b) => { - if (sortConfig.key === "model") { - const compared = a.model.localeCompare(b.model); - return sortConfig.direction === "asc" ? compared : -compared; - } - - if (sortConfig.key === "addedDate") { - const compared = - a.addedDate.localeCompare(b.addedDate) || - a.model.localeCompare(b.model); - return sortConfig.direction === "asc" ? compared : -compared; - } - - const compared = - a[sortConfig.key].score - b[sortConfig.key].score || - b[sortConfig.key].rank - a[sortConfig.key].rank; - - return sortConfig.direction === "asc" ? compared : -compared; - }); - }, [filteredRows, sortConfig]); - - const handleSort = (key: SortKey) => { - setSortConfig((current) => ({ - key, - direction: - current.key === key && current.direction === "desc" ? 
"asc" : "desc", - })); - }; - - const renderSortMarker = (key: SortKey) => { - if (sortConfig.key !== key) return ""; - return sortConfig.direction === "asc" ? " ^" : " v"; - }; - - const renderMetric = (metric: MetricScore) => ( - - - {metric.score.toFixed(1)} - - - ({metric.rank}) - - - ); - - const formulaClass = - "mt-2 overflow-x-auto rounded border border-gray-200 p-4 text-sm text-gray-800 dark:border-gray-700 dark:text-gray-100"; - const mathClass = - "inline-flex min-w-max items-center gap-2 font-serif leading-8"; - const fractionClass = "inline-flex flex-col items-center px-1 align-middle"; - - return ( -
    - - -
    -
    -
    -
    -
    -

    - {t("llmEvals.eyebrow")} -

    -

    - {t("llmEvals.title")} -

    -

    - {t("llmEvals.intro")} -

    -

    - {t("llmEvals.methodology")} -

    -

    - {t("llmEvals.variants")} -

    -

    - {t("llmEvals.modelDetailsPrefix")}{" "} - - {t("llmEvals.detailsLink")} - - {t("llmEvals.modelDetailsSuffix")}{" "} - {t("llmEvals.methodDetailsPrefix")}{" "} - - {t("llmEvals.paperLink")} - - {t("llmEvals.methodDetailsSuffix")} -

    -
    - -
    -
    -

    - {t("llmEvals.modelsEvaluated")} -

    -

    - {data.rows.length} -

    -
    -
    -

    - {t("llmEvals.topCommonsensicalityII")} -

    -

    - {topModel.model} -

    -

    - {t("llmEvals.scoreRank", { - score: topModel.commonsensicalityII.score.toFixed(1), - rank: topModel.commonsensicalityII.rank, - })} -

    -
    -
    -

    - {t("llmEvals.lastUpdated")} -

    -

    - {formatDate(data.lastUpdated, locale)} -

    -
    -
    - -
    -
    -
    -
    -

    - {t("llmEvals.tableTitle")} -

    -

    - {t("llmEvals.tableHelp")} -

    -
    - -
    - - - - - -
    -
    -
    - -
    - - - - - - - {metricColumns.map((column) => ( - - ))} - - - - {sortedRows.map((row) => ( - - - - {metricColumns.map((column) => ( - - ))} - - ))} - -
    - {t("llmEvals.caption")} -
    - - - - - -
    - {row.model} - - {formatDate(row.addedDate, locale)} - - {renderMetric(row[column.key])} -
    -
    - -
    - - {t("llmEvals.showing", { - shown: sortedRows.length, - total: data.rows.length, - })} - - {t("llmEvals.scoreNote")} -
    - -
    -

    - {t("llmEvals.calculationTitle")} -

    -

    - {t("llmEvals.calculationIntro")} -

    - -
    -
    -

    - {t("llmEvals.variantIFormulaLabel")} -

    -
    - - - majorityi - h - - = - - 1[di - h,a ≥ 0.5] - - -
    -
    -
    -

    - {t("llmEvals.variantIIFormulaLabel")} -

    -
    - - - majorityi - h - - = - 1[ - - - αi - m + ∑ - j ∈ Ωi A - i,j - - - |Ωi| + 1 - - - ≥ 0.5] - -
    -
    -
    - -
    -
    -
    - d_i^{"{h,a}"} -
    -
    {t("llmEvals.humanShareDefinition")}
    -
    -
    -
    - Omega_i -
    -
    {t("llmEvals.omegaDefinition")}
    -
    -
    -
    - alpha_i^m -
    -
    {t("llmEvals.alphaDefinition")}
    -
    -
    -
    - A_i,j -
    -
    {t("llmEvals.ratingsDefinition")}
    -
    -
    -
    - 1[...] -
    -
    {t("llmEvals.indicatorDefinition")}
    -
    -
    -
    -
    -
    -
    -
    -
    - -
    - ); + return null; }; export default LlmEvals;