🚀 Evalita Leaderboard 🚀

Evalita-LLM is a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. Its distinguishing and innovative features are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) in addition to well-established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, mitigating model sensitivity to specific prompts and allowing a fairer, more objective evaluation.

{
  "headers": ["T", "Model", "Average ⬆️",
              "Textual Entailment", "Textual Entailment_best",
              "Sentiment Analysis", "Sentiment Analysis_best",
              "Hate Speech", "Hate Speech_best",
              "Admission Test", "Admission Test_best",
              "Word in Context", "Word in Context_best",
              "FAQ", "FAQ_best",
              "Lexical Substitution", "Lexical Substitution_best",
              "Summarization", "Summarization_best",
              "NER", "NER_best",
              "REL", "REL_best",
              "Type", "Architecture", "Precision", "Hub License",
              "#Params (B)", "Hub ❤️", "Available on the hub", "Model sha"],
  "data": [
    ["?", "microsoft/Phi-3.5-mini-instruct", 4965.1, 7004, 8150, 4373, 7059, 6449, 6917, 4067, 6280, 5697, 6741, 4422, 7905, 1829, 2094, 2270, 2340, 6069, 6135, 1440, 2061, "", "Phi3ForCausalLM", "bfloat16", "?", 0, 0, true, "main"],
    ["🔶", "arcee-ai/Llama-3.1-SuperNova-Lite", 4655.05, 6854, 7450, 6380, 7712, 5221, 6380, 4363, 6860, 4288, 6658, 5116, 9052, 2205, 2429, 2274, 2278, 1772, 2021, 1782, 2006, "fine-tuned", "LlamaForCausalLM", "bfloat16", "llama3", 8.03, 19, true, "main"],
    ["🟢", "Almawave/Velvet-14B (5-shot)", 4617.05, 7446, 7675, 6799, 6893, 6635, 6897, 4973, 6120, 1541, 3769, 4173, 8055, 921, 945, 3426, 3488, 4895, 4873, 1307, 1510, "pretrained", "?", "bfloat16", "apache-2.0", 7.24, 0, false, "main"],
    ["🔶", "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", 4562.6, 5892, 6910, 6095, 7204, 3959, 6632, 4073, 6220, 5260, 6657, 4285, 7182, 1935, 1937, 2237, 2271, 3804, 4757, 1781, 2161, "fine-tuned", "LlamaForCausalLM", "bfloat16", "apache-2.0", 8.03, 0, true, "main"],
    ["🟢", "Almawave/Velvet-14B (0-shot)", 4432, 6763, 7850, 6049, 6659, 5593, 6564, 4240, 5740, 3791, 6431, 4547, 8953, 7, 13, 3109, 3111, 3148, 4379, 695, 998, "pretrained", "?", "bfloat16", "apache-2.0", 7.24, 0, false, "main"],
    ["🟢", "mistralai/Mistral-7B-Instruct-v0.3", 4423.4, 6083, 7125, 5871, 6614, 5586, 6539, 3923, 5400, 6107, 6649, 4763, 8479, 647, 1026, 2728, 2843, 1927, 1946, 2029, 2183, "pretrained", "MistralForCausalLM", "bfloat16", "apache-2.0", 7.25, 17, true, "main"],
    ["?", "CohereForAI/aya-expanse-8b", 4293.95, 6033, 7500, 6016, 7062, 6048, 6297, 3853, 5780, 4595, 6619, 3803, 6633, 1162, 1592, 1885, 1921, 3471, 3900, 752, 957, "", "?", "bfloat16", "?", 0, 0, false, "main"],
    ["🟢", "ibm-granite/granite-3.1-8b-instruct", 4205.25, 5108, 6700, 4782, 6945, 3951, 6264, 4197, 5940, 4200, 6685, 5129, 9152, 9, 17, 3004, 3039, 3205, 3713, 918, 1147, "pretrained", "GraniteForCausalLM", "bfloat16", "apache-2.0", 8.17, 124, true, "main"],
    ["🔶", "occiglot/occiglot-7b-it-en-instruct", 4050.35, 4992, 5600, 5187, 6109, 4849, 6272, 4280, 5400, 4286, 6649, 4289, 8329, 155, 288, 2550, 2856, 1053, 1415, 3203, 3245, "fine-tuned", "MistralForCausalLM", "bfloat16", "apache-2.0", 7.24, 5, true, "main"],
    ["🔶", "FairMind/Llama-3-8B-4bit-UltraChat-Ita", 3836.2, 5808, 6425, 4601, 6730, 4872, 6338, 3337, 4920, 6603, 6658, 3454, 5561, 0, 0, 2360, 2435, 2744, 3189, 262, 427, "fine-tuned", "LlamaForCausalLM", "bfloat16", "apache-2.0", 8.03, 0, true, "main"],
    ["🔶", "sapienzanlp/Minerva-7B-instruct-v1.0", 3373.4, 5413, 5650, 4452, 5946, 3923, 6048, 2887, 3400, 5341, 6604, 3105, 3766, 0, 0, 1622, 1636, 1831, 1931, 1775, 2138, "fine-tuned", "MistralForCausalLM", "bfloat16", "apache-2.0", 7.4, 0, true, "main"],
    ["🔶", "MoxoffSpA/Volare", 3206.2, 5125, 5550, 4415, 5849, 4357, 6270, 2277, 2740, 2780, 6640, 2735, 2818, 2, 4, 2315, 2332, 2126, 2157, 1595, 2037, "fine-tuned", "GemmaForCausalLM", "bfloat16", "apache-2.0", 7.24, 0, true, "main"],
    ["?", "iGeniusAI/Italia-9B-Instruct-v0.1", 3194.55, 5933, 7025, 4841, 5926, 5890, 6358, 2613, 3040, 4992, 6667, 3126, 3890, 0, 0, 0, 1, 1507, 2082, 0, 0, "", "?", "bfloat16", "?", 0, 0, true, "main"]
  ],
  "metadata": null
}
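
For programmatic use, the payload above can be loaded straight into a dataframe. The following is a minimal sketch, not part of the leaderboard itself: it assumes the JSON has been saved locally as leaderboard.json (a hypothetical filename) and recomputes the Average ⬆️ column, which for the rows shown above coincides with the arithmetic mean of the twenty task-score columns (combined and _best).

```python
import json
import pandas as pd

# Load the leaderboard payload (hypothetical local filename).
with open("leaderboard.json", encoding="utf-8") as f:
    payload = json.load(f)

# Build a dataframe from the headers/data arrays.
df = pd.DataFrame(payload["data"], columns=payload["headers"])

# The 20 per-task score columns sit between "Average ⬆️" and "Type".
task_cols = payload["headers"][3:23]

# For the rows above, "Average ⬆️" equals the mean of the 20 task scores.
recomputed = df[task_cols].mean(axis=1)
print(df[["Model", "Average ⬆️"]].assign(recomputed=recomputed.round(2)))
```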