open-llm-leaderboard-old/details_ToastyPigeon__SmolLlama-1.5B-Bottomheavy
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上对模型 ToastyPigeon/SmolLlama-1.5B-Bottomheavy 进行评估运行期间自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割，分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_ToastyPigeon__SmolLlama-1.5B-Bottomheavy",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-03-29T21:19:15.794054 运行的最新结果:
python { "all": { "acc": 0.2555090616515296, "acc_stderr": 0.030583579491884116, "acc_norm": 0.2569980595050836, "acc_norm_stderr": 0.03135581805562302, "mc1": 0.20930232558139536, "mc1_stderr": 0.01424121943478583, "mc2": 0.3500293686015524, "mc2_stderr": 0.014111093861772868 }, "harness|arc:challenge|25": { "acc": 0.30716723549488056, "acc_stderr": 0.013481034054980945, "acc_norm": 0.34215017064846415, "acc_norm_stderr": 0.013864152159177275 }, "harness|hellaswag|10": { "acc": 0.4500099581756622, "acc_stderr": 0.00496477980518066, "acc_norm": 0.595399322844055, "acc_norm_stderr": 0.00489811511097504 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.23, "acc_stderr": 0.04229525846816503, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816503 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.23703703703703705, "acc_stderr": 0.03673731683969506, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.03673731683969506 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.19736842105263158, "acc_stderr": 0.03238981601699397, "acc_norm": 0.19736842105263158, "acc_norm_stderr": 0.03238981601699397 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2490566037735849, "acc_stderr": 0.026616482980501715, "acc_norm": 0.2490566037735849, "acc_norm_stderr": 0.026616482980501715 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.22916666666666666, "acc_stderr": 0.03514697467862388, "acc_norm": 0.22916666666666666, "acc_norm_stderr": 0.03514697467862388 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.28, "acc_stderr": 0.04512608598542127, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.14, "acc_stderr": 0.03487350880197772, "acc_norm": 0.14, "acc_norm_stderr": 0.03487350880197772 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.24, "acc_stderr": 0.042923469599092816, "acc_norm": 0.24, "acc_norm_stderr": 0.042923469599092816 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.19653179190751446, "acc_stderr": 0.03029957466478815, "acc_norm": 0.19653179190751446, "acc_norm_stderr": 0.03029957466478815 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.18627450980392157, "acc_stderr": 0.03873958714149351, "acc_norm": 0.18627450980392157, "acc_norm_stderr": 0.03873958714149351 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.15, "acc_stderr": 0.035887028128263714, "acc_norm": 0.15, "acc_norm_stderr": 0.035887028128263714 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.3148936170212766, "acc_stderr": 0.030363582197238167, "acc_norm": 0.3148936170212766, "acc_norm_stderr": 0.030363582197238167 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2894736842105263, "acc_stderr": 0.04266339443159394, "acc_norm": 0.2894736842105263, "acc_norm_stderr": 0.04266339443159394 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.2482758620689655, "acc_stderr": 0.03600105692727771, "acc_norm": 0.2482758620689655, "acc_norm_stderr": 0.03600105692727771 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.2671957671957672, "acc_stderr": 0.022789673145776575, "acc_norm": 0.2671957671957672, "acc_norm_stderr": 0.022789673145776575 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.19047619047619047, "acc_stderr": 0.03512207412302052, "acc_norm": 0.19047619047619047, "acc_norm_stderr": 0.03512207412302052 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.29, "acc_stderr": 0.04560480215720684, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.27419354838709675, "acc_stderr": 0.025378139970885196, "acc_norm": 0.27419354838709675, "acc_norm_stderr": 0.025378139970885196 }, "harness|hendrycksTest-high_school_chemistry|5": { 
"acc": 0.27586206896551724, "acc_stderr": 0.03144712581678241, "acc_norm": 0.27586206896551724, "acc_norm_stderr": 0.03144712581678241 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.2545454545454545, "acc_stderr": 0.03401506715249039, "acc_norm": 0.2545454545454545, "acc_norm_stderr": 0.03401506715249039 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.2828282828282828, "acc_stderr": 0.032087795587867514, "acc_norm": 0.2828282828282828, "acc_norm_stderr": 0.032087795587867514 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.25906735751295334, "acc_stderr": 0.031618779179354094, "acc_norm": 0.25906735751295334, "acc_norm_stderr": 0.031618779179354094 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.3153846153846154, "acc_stderr": 0.023559646983189946, "acc_norm": 0.3153846153846154, "acc_norm_stderr": 0.023559646983189946 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.25555555555555554, "acc_stderr": 0.026593939101844086,



