open-llm-leaderboard-old/details_nicholasKluge__Aira-124M
收藏数据集概述
数据集简介
该数据集是在对模型 nicholasKluge/Aira-124M 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 61 个配置,每个配置对应一个评估任务。
- 数据集从 1 次运行中创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_nicholasKluge__Aira-124M", "harness_truthfulqa_mc_0", split="train")
最新结果
以下是 2023-08-29T19:04:35.532451 运行的最新结果:
python { "all": { "acc": 0.25265346552614076, "acc_stderr": 0.03117857003137413, "acc_norm": 0.253799928797708, "acc_norm_stderr": 0.03119563902907945, "mc1": 0.2460220318237454, "mc1_stderr": 0.01507721920066259, "mc2": 0.41020465472810524, "mc2_stderr": 0.015012374839842264 }, "harness|arc:challenge|25": { "acc": 0.19880546075085323, "acc_stderr": 0.01166285019817554, "acc_norm": 0.24573378839590443, "acc_norm_stderr": 0.012581033453730107 }, "harness|hellaswag|10": { "acc": 0.2921728739294961, "acc_stderr": 0.004538319464111971, "acc_norm": 0.312885879306911, "acc_norm_stderr": 0.004627207073171273 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.22, "acc_stderr": 0.04163331998932268, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932268 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.2074074074074074, "acc_stderr": 0.03502553170678316, "acc_norm": 0.2074074074074074, "acc_norm_stderr": 0.03502553170678316 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.17763157894736842, "acc_stderr": 0.031103182383123398, "acc_norm": 0.17763157894736842, "acc_norm_stderr": 0.031103182383123398 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.2, "acc_stderr": 0.040201512610368445, "acc_norm": 0.2, "acc_norm_stderr": 0.040201512610368445 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.21132075471698114, "acc_stderr": 0.025125766484827845, "acc_norm": 0.21132075471698114, "acc_norm_stderr": 0.025125766484827845 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2569444444444444, "acc_stderr": 0.03653946969442099, "acc_norm": 0.2569444444444444, "acc_norm_stderr": 0.03653946969442099 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.19, "acc_stderr": 0.039427724440366234, "acc_norm": 0.19, "acc_norm_stderr": 0.039427724440366234 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.28, "acc_stderr": 0.045126085985421276, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.19, "acc_stderr": 0.03942772444036623, "acc_norm": 0.19, "acc_norm_stderr": 0.03942772444036623 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.24277456647398843, "acc_stderr": 0.0326926380614177, "acc_norm": 0.24277456647398843, "acc_norm_stderr": 0.0326926380614177 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.21568627450980393, "acc_stderr": 0.04092563958237654, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237654 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.26382978723404255, "acc_stderr": 0.028809989854102973, "acc_norm": 0.26382978723404255, "acc_norm_stderr": 0.028809989854102973 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2543859649122807, "acc_stderr": 0.04096985139843671, "acc_norm": 0.2543859649122807, "acc_norm_stderr": 0.04096985139843671 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.2620689655172414, "acc_stderr": 0.036646663372252565, "acc_norm": 0.2620689655172414, "acc_norm_stderr": 0.036646663372252565 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.24867724867724866, "acc_stderr": 0.022261817692400168, "acc_norm": 0.24867724867724866, "acc_norm_stderr": 0.022261817692400168 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.16666666666666666, "acc_stderr": 0.03333333333333337, "acc_norm": 0.16666666666666666, "acc_norm_stderr": 0.03333333333333337 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.13, "acc_stderr": 0.03379976689896309, "acc_norm": 0.13, "acc_norm_stderr": 0.03379976689896309 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.21935483870967742, "acc_stderr": 0.023540799358723285, "acc_norm": 0.21935483870967742, "acc_norm_stderr": 0.023540799358723285 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.19704433497536947, "acc_stderr": 0.02798672466673622, "acc_norm": 0.19704433497536947, "acc_norm_stderr": 0.02798672466673622 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.32, "acc_stderr": 0.04688261722621504, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.21212121212121213, "acc_stderr": 0.03192271569548299, "acc_norm": 0.21212121212121213, "acc_norm_stderr": 0.03192271569548299 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.35353535353535354, "acc_stderr": 0.03406086723547153, "acc_norm": 0.35353535353535354, "acc_norm_stderr": 0.03406086723547153 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.36787564766839376, "acc_stderr": 0.03480175668466036, "acc_norm": 0.36787564766839376, "acc_norm_stderr": 0.03480175668466036 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.30256410256410254, "acc_stderr": 0.023290888053772725, "acc_norm": 0.30256410256410254, "acc_norm_stderr": 0.023290888053772725 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.22962962962962963, "acc_stderr": 0.02564410863926763,



