open-llm-leaderboard-old/details_Radu1999__Mistral-Instruct-Ukrainian-SFT-DPO
收藏数据集概述
数据集组成
- 该数据集包含63个配置,每个配置对应一个评估任务。
- 数据集由1次运行创建,每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train"分割始终指向最新的结果。
- 一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示在Open LLM Leaderboard上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_Radu1999__Mistral-Instruct-Ukrainian-SFT-DPO", "harness_winogrande_5", split="train")
最新结果
- 这些是最新结果,来自2024-02-11T12:02:04.707768的运行(注意可能还有其他任务的结果,可以在results和每个评估的"latest"分割中找到):
python { "all": { "acc": 0.6083454936984162, "acc_stderr": 0.033140017189034275, "acc_norm": 0.6127945476017843, "acc_norm_stderr": 0.0338104933555728, "mc1": 0.40514075887392903, "mc1_stderr": 0.01718561172775337, "mc2": 0.5791139392635098, "mc2_stderr": 0.015266138543062658 }, "harness|arc:challenge|25": { "acc": 0.5665529010238908, "acc_stderr": 0.014481376224558903, "acc_norm": 0.6049488054607508, "acc_norm_stderr": 0.014285898292938163 }, "harness|hellaswag|10": { "acc": 0.6436964748058156, "acc_stderr": 0.004779276329704048, "acc_norm": 0.8383788090021908, "acc_norm_stderr": 0.0036735065123709547 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.34, "acc_stderr": 0.047609522856952365, "acc_norm": 0.34, "acc_norm_stderr": 0.047609522856952365 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5851851851851851, "acc_stderr": 0.04256193767901408, "acc_norm": 0.5851851851851851, "acc_norm_stderr": 0.04256193767901408 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.631578947368421, "acc_stderr": 0.03925523381052932, "acc_norm": 0.631578947368421, "acc_norm_stderr": 0.03925523381052932 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.6, "acc_stderr": 0.049236596391733084, "acc_norm": 0.6, "acc_norm_stderr": 0.049236596391733084 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6830188679245283, "acc_stderr": 0.028637235639800893, "acc_norm": 0.6830188679245283, "acc_norm_stderr": 0.028637235639800893 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6944444444444444, "acc_stderr": 0.03852084696008534, "acc_norm": 0.6944444444444444, "acc_norm_stderr": 0.03852084696008534 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.39, "acc_stderr": 0.04902071300001974, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5780346820809249, "acc_stderr": 0.0376574669386515, "acc_norm": 0.5780346820809249, "acc_norm_stderr": 0.0376574669386515 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3627450980392157, "acc_stderr": 0.047840607041056527, "acc_norm": 0.3627450980392157, "acc_norm_stderr": 0.047840607041056527 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.71, "acc_stderr": 0.045604802157206845, "acc_norm": 0.71, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.548936170212766, "acc_stderr": 0.032529096196131965, "acc_norm": 0.548936170212766, "acc_norm_stderr": 0.032529096196131965 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.47368421052631576, "acc_stderr": 0.046970851366478626, "acc_norm": 0.47368421052631576, "acc_norm_stderr": 0.046970851366478626 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6206896551724138, "acc_stderr": 0.04043461861916747, "acc_norm": 0.6206896551724138, "acc_norm_stderr": 0.04043461861916747 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.38095238095238093, "acc_stderr": 0.0250107491161376, "acc_norm": 0.38095238095238093, "acc_norm_stderr": 0.0250107491161376 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.38095238095238093, "acc_stderr": 0.04343525428949097, "acc_norm": 0.38095238095238093, "acc_norm_stderr": 0.04343525428949097 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7064516129032258, "acc_stderr": 0.025906087021319295, "acc_norm": 0.7064516129032258, "acc_norm_stderr": 0.025906087021319295 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.49261083743842365, "acc_stderr": 0.035176035403610084, "acc_norm": 0.49261083743842365, "acc_norm_stderr": 0.035176035403610084 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.64, "acc_stderr": 0.048241815132442176, "acc_norm": 0.64, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.703030303030303, "acc_stderr": 0.0356796977226805, "acc_norm": 0.703030303030303, "acc_norm_stderr": 0.0356796977226805 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7424242424242424, "acc_stderr": 0.031156269519646826, "acc_norm": 0.7424242424242424, "acc_norm_stderr": 0.031156269519646826 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8652849740932642, "acc_stderr": 0.02463978909770944, "acc_norm": 0.8652849740932642, "acc_norm_stderr": 0.02463978909770944 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5717948717948718, "acc_stderr": 0.025088301454694827, "acc_norm": 0.5717948717948718, "acc_norm_stderr": 0.025088301454694827 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.32222222222222224, "acc_stderr": 0.028493465091028597, "acc_norm": 0.32222222222222224, "acc_norm_stderr": 0.028493465091028597 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6470588235294118, "acc_stderr": 0.031041941304059288, "acc_norm": 0.6470588235294118, "acc_norm_stderr": 0.031041941304059288 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.33774834437086093, "acc_stderr": 0.0386155754625517, "acc_norm": 0.33774834437086093, "acc_norm_stderr": 0.0386155754625517 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.7981651376146789, "acc_stderr": 0.017208579357787586, "acc_norm": 0.7981651376146789, "acc_norm_stderr": 0.017208579357787586 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.4722222222222222, "acc_stderr": 0.0340470532865388, "acc_norm": 0.4722222222222222, "acc_norm_stderr": 0.0340470532865388 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.7598039215686274, "acc_stderr": 0.02998373305591361, "acc_norm": 0.7598039215686274, "acc_norm_stderr": 0.02998373305591361 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.7552742616033755, "acc_stderr": 0.027985699387036423, "acc_norm": 0.7552742616033755, "acc_norm_stderr": 0.027985699387036423 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6143497757847534, "acc_stderr": 0.03266842214289201, "acc_norm": 0.6143497757847534, "acc_norm_stderr": 0.03266842214289201 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7404580152671756, "acc_stderr": 0.03844876139785271, "acc_norm": 0.7404580152671756, "acc_norm_stderr": 0.03844876139785271 }, "harness|hendrycksTest-international_law|5": { "acc": 0.7933884297520661, "acc_stderr": 0.036959801280988226, "acc_norm": 0.7933884297520661, "acc_norm_stderr": 0.036959801280988226 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.6944444444444444, "acc_stderr": 0.04453197507374984, "acc_norm": 0.6944444444444444, "acc_norm_stderr": 0.04453197507374984 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7239263803680982, "acc_stderr": 0.03512385283705048, "acc_norm": 0.7239263803680982, "acc_norm_stderr": 0.03512385283705048 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.45535714285714285, "acc_stderr": 0.047268355537191, "acc_norm": 0.45535714285714285, "acc_norm_stderr": 0.047268355537191 }, "harness|hendrycksTest-management|5": { "acc": 0.6990291262135923, "acc_stderr": 0.045416094465039504, "acc_norm": 0.6990291262135923, "acc_norm_stderr": 0.045416094465039504 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8547008547008547, "acc_stderr": 0.02308663508684141, "acc_norm": 0.8547008547008547, "acc_norm_stderr": 0.02308663508684141 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.7803320561941252, "acc_stderr": 0.014805384478371155, "acc_norm": 0.7803320561941252, "acc_norm_stderr": 0.014805384478371155 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.6878612716763006, "acc_stderr": 0.024946792225272314, "acc_norm": 0.6878612716763006, "acc_norm_stderr": 0.024946792225272314 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.3653631284916201, "acc_stderr": 0.01610483388014229, "acc_norm": 0.3653631284916201, "acc_norm_stderr": 0.01610483388014229 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.696078431372549, "acc_stderr": 0.02633661346904663, "acc_norm": 0.696078431372549, "acc_norm_stderr": 0.02633661346904663 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.6784565916398714, "acc_stderr": 0.026527724079528872, "acc_norm": 0.6784565916398714, "acc_norm_stderr": 0.026527724079528872 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.6851851851851852, "acc_stderr": 0.025842248700902168, "acc_norm": 0.6851851851851852, "acc_norm_stderr": 0.025842248700902168 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.46099290780141844, "acc_stderr": 0.029736592526424438, "acc_norm": 0.46099290780141844, "acc_norm_stderr": 0.029736592526424438 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.44589308996088656, "acc_stderr": 0.012695244711379772, "acc_norm": 0.44589308996088656, "acc_norm_stderr": 0.012695244711379772 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.5955882352941176, "acc_stderr": 0.029812630701569743, "acc_norm": 0.5955882352941176, "acc_norm_stderr": 0.029812630701569743 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6111111111111112, "acc_stderr": 0.019722058939618068, "acc_norm": 0.6111111111111112, "acc_norm_stderr": 0.019722058939618068 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.7272727272727273, "acc_stderr": 0.04265792110940588, "acc_norm": 0.7272727272727273, "acc_norm_stderr": 0.04265792110940588 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7020408163265306, "acc_stderr": 0.029279567411065677, "acc_norm": 0.7020408163265306, "acc_norm_stderr": 0.029279567411065677 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8308457711442786, "acc_stderr": 0.026508590656233257, "acc_norm": 0.8308457711442786, "acc_norm_stderr": 0.026508590656233257 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.81, "acc_stderr": 0.03942772444036625, "acc_norm": 0.81, "acc_norm_stderr": 0.03942772444036625 }, "harness|hendrycksTest-virology|5": { "acc": 0.5, "acc_stderr": 0.03892494720807614, "acc_norm": 0.5, "acc_norm_stderr": 0.03892494720807614 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8245614035087719, "acc_stderr": 0.029170885500727668, "acc_norm": 0.8245614035087719, "acc_norm_stderr": 0.029170885500727668 }, "harness|truthfulqa:mc|0": { "mc1": 0.40514075887392903, "mc1_stderr": 0.01718561172775337, "mc2": 0.5791139392635098, "mc2_stderr": 0.015266138543062658 }, "harness|winogrande|5": { "acc": 0.7695343330702447, "acc_stderr": 0.011835872164836676 }, "harness|gsm8k|5": { "acc": 0.4177407126611069, "acc_stderr": 0.013584820638504832 } }



