| Model | ARC | HellaSwag | MMLU | TruthfulQA | Winogrande | GSM8K | Average |
|---|---|---|---|---|---|---|---|
| mera-mix-4x7B-v1 | 46.67 | 69.64 | 39.65 | 46.78 | 68.27 | 2.73 | 45.62 |
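The detailed tables below are lm-evaluation-harness outputs (metric names carry the `,none` filter suffix used by harness v0.4+). If you want to reproduce a run like this, a minimal sketch using the harness's Python API is shown here; the Hub model ID, batch size, and few-shot settings are assumptions for illustration, not the exact settings of this run.

```python
# Minimal reproduction sketch with lm-evaluation-harness (v0.4+).
# Assumption: the model is published as "meraGPT/mera-mix-4x7B-v1" on the
# Hugging Face Hub; adjust model_args/batch_size to your hardware.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meraGPT/mera-mix-4x7B-v1",
    tasks=["arc_challenge", "hellaswag", "mmlu",
           "truthfulqa", "winogrande", "gsm8k"],
    batch_size=8,
)

# results["results"] maps each task name to its metric dict, e.g.
# results["results"]["arc_challenge"]["acc_norm,none"]
for task, metrics in results["results"].items():
    print(task, metrics)
```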
| Task | Version | Metric | Value | Stderr |
|---|---|---|---|---|
| arc_challenge | 1 | acc,none | 0.44 | 0.01 |
| | | acc_norm,none | 0.47 | 0.01 |
Average: 46.67%
| Task | Version | Metric | Value | Stderr |
|---|---|---|---|---|
| hellaswag | 1 | acc,none | 0.52 | 0.00 |
| | | acc_norm,none | 0.70 | 0.00 |
Average: 69.64%
| Task | Version | Metric | Value | Stderr |
|---|---|---|---|---|
| mmlu | N/A | acc,none | 0.40 | 0.00 |
| mmlu_abstract_algebra | 0 | acc,none | 0.33 | 0.05 |
| mmlu_anatomy | 0 | acc,none | 0.47 | 0.04 |
| mmlu_astronomy | 0 | acc,none | 0.47 | 0.04 |
| mmlu_business_ethics | 0 | acc,none | 0.41 | 0.05 |
| mmlu_clinical_knowledge | 0 | acc,none | 0.40 | 0.03 |
| mmlu_college_biology | 0 | acc,none | 0.43 | 0.04 |
| mmlu_college_chemistry | 0 | acc,none | 0.29 | 0.05 |
| mmlu_college_computer_science | 0 | acc,none | 0.38 | 0.05 |
| mmlu_college_mathematics | 0 | acc,none | 0.25 | 0.04 |
| mmlu_college_medicine | 0 | acc,none | 0.38 | 0.04 |
| mmlu_college_physics | 0 | acc,none | 0.24 | 0.04 |
| mmlu_computer_security | 0 | acc,none | 0.52 | 0.05 |
| mmlu_conceptual_physics | 0 | acc,none | 0.32 | 0.03 |
| mmlu_econometrics | 0 | acc,none | 0.27 | 0.04 |
| mmlu_electrical_engineering | 0 | acc,none | 0.35 | 0.04 |
| mmlu_elementary_mathematics | 0 | acc,none | 0.29 | 0.02 |
| mmlu_formal_logic | 0 | acc,none | 0.21 | 0.04 |
| mmlu_global_facts | 0 | acc,none | 0.27 | 0.04 |
| mmlu_high_school_biology | 0 | acc,none | 0.49 | 0.03 |
| mmlu_high_school_chemistry | 0 | acc,none | 0.32 | 0.03 |
| mmlu_high_school_computer_science | 0 | acc,none | 0.46 | 0.05 |
| mmlu_high_school_european_history | 0 | acc,none | 0.49 | 0.04 |
| mmlu_high_school_geography | 0 | acc,none | 0.63 | 0.03 |
| mmlu_high_school_government_and_politics | 0 | acc,none | 0.53 | 0.04 |
| mmlu_high_school_macroeconomics | 0 | acc,none | 0.37 | 0.02 |
| mmlu_high_school_mathematics | 0 | acc,none | 0.29 | 0.03 |
| mmlu_high_school_microeconomics | 0 | acc,none | 0.33 | 0.03 |
| mmlu_high_school_physics | 0 | acc,none | 0.28 | 0.04 |
| mmlu_high_school_psychology | 0 | acc,none | 0.51 | 0.02 |
| mmlu_high_school_statistics | 0 | acc,none | 0.30 | 0.03 |
| mmlu_high_school_us_history | 0 | acc,none | 0.50 | 0.04 |
| mmlu_high_school_world_history | 0 | acc,none | 0.49 | 0.03 |
| mmlu_human_aging | 0 | acc,none | 0.43 | 0.03 |
| mmlu_human_sexuality | 0 | acc,none | 0.46 | 0.04 |
| mmlu_humanities | N/A | acc,none | 0.38 | 0.01 |
| mmlu_international_law | 0 | acc,none | 0.53 | 0.05 |
| mmlu_jurisprudence | 0 | acc,none | 0.38 | 0.05 |
| mmlu_logical_fallacies | 0 | acc,none | 0.44 | 0.04 |
| mmlu_machine_learning | 0 | acc,none | 0.30 | 0.04 |
| mmlu_management | 0 | acc,none | 0.53 | 0.05 |
| mmlu_marketing | 0 | acc,none | 0.55 | 0.03 |
| mmlu_medical_genetics | 0 | acc,none | 0.44 | 0.05 |
| mmlu_miscellaneous | 0 | acc,none | 0.53 | 0.02 |
| mmlu_moral_disputes | 0 | acc,none | 0.42 | 0.03 |
| mmlu_moral_scenarios | 0 | acc,none | 0.30 | 0.02 |
| mmlu_nutrition | 0 | acc,none | 0.39 | 0.03 |
| mmlu_other | N/A | acc,none | 0.42 | 0.01 |
| mmlu_philosophy | 0 | acc,none | 0.42 | 0.03 |
| mmlu_prehistory | 0 | acc,none | 0.45 | 0.03 |
| mmlu_professional_accounting | 0 | acc,none | 0.29 | 0.03 |
| mmlu_professional_law | 0 | acc,none | 0.31 | 0.01 |
| mmlu_professional_medicine | 0 | acc,none | 0.32 | 0.03 |
| mmlu_professional_psychology | 0 | acc,none | 0.38 | 0.02 |
| mmlu_public_relations | 0 | acc,none | 0.44 | 0.05 |
| mmlu_security_studies | 0 | acc,none | 0.39 | 0.03 |
| mmlu_social_sciences | N/A | acc,none | 0.44 | 0.01 |
| mmlu_sociology | 0 | acc,none | 0.57 | 0.04 |
| mmlu_stem | N/A | acc,none | 0.35 | 0.01 |
| mmlu_us_foreign_policy | 0 | acc,none | 0.60 | 0.05 |
| mmlu_virology | 0 | acc,none | 0.31 | 0.04 |
| mmlu_world_religions | 0 | acc,none | 0.58 | 0.04 |
Average: 39.65%
| Task | Version | Metric | Value | Stderr |
|---|---|---|---|---|
| truthfulqa | N/A | acc,none | 0.38 | 0.01 |
| truthfulqa_gen | 3 | bleu_max,none | 19.86 | 0.72 |
| | | bleu_acc,none | 0.35 | 0.02 |
| | | bleu_diff,none | -1.73 | 0.64 |
| | | rouge1_max,none | 44.36 | 0.84 |
| | | rouge1_acc,none | 0.35 | 0.02 |
| | | rouge1_diff,none | -3.93 | 0.76 |
| | | rouge2_max,none | 28.47 | 0.92 |
| | | rouge2_acc,none | 0.30 | 0.02 |
| | | rouge2_diff,none | -3.73 | 0.89 |
| | | rougeL_max,none | 40.99 | 0.85 |
| | | rougeL_acc,none | 0.34 | 0.02 |
| | | rougeL_diff,none | -3.57 | 0.77 |
| truthfulqa_mc1 | 2 | acc,none | 0.30 | 0.02 |
| truthfulqa_mc2 | 2 | acc,none | 0.47 | 0.02 |
Average: 46.78%
| Task | Version | Metric | Value | Stderr |
|---|---|---|---|---|
| winogrande | 1 | acc,none | 0.68 | 0.01 |
Average: 68.27%
| Task | Version | Metric | Value | Stderr |
|---|---|---|---|---|
| gsm8k | 3 | exact_match,strict-match | 0.03 | 0.00 |
| | | exact_match,flexible-extract | 0.06 | 0.01 |
Average: 2.73%
Average score: 45.62%
Elapsed time: 06:55:37
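For reference, the headline 45.62% is the unweighted mean of the six per-benchmark averages in the summary table; a quick sanity check in Python:

```python
# Sanity check: the overall score is the plain mean of the six per-task averages.
scores = {
    "arc_challenge": 46.67,
    "hellaswag": 69.64,
    "mmlu": 39.65,
    "truthfulqa": 46.78,
    "winogrande": 68.27,
    "gsm8k": 2.73,
}
average = sum(scores.values()) / len(scores)
print(f"Average score: {average:.2f}%")  # -> 45.62%
```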