open-llm-leaderboard/details_42dot__42dot_LLM-PLM-1.3B
数据集概述
数据集简介
该数据集是在对模型 42dot/42dot_LLM-PLM-1.3B 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集结构
- 数据集包含 64 个配置,每个配置对应一个评估任务。
- 数据集由 2 次运行创建。每次运行在各个配置中对应一个特定的分割，分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_42dot__42dot_LLM-PLM-1.3B_public",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2023-11-15T08:12:34.029868 这次运行的最新结果：
python { "all": { "acc": 0.2748396034833794, "acc_stderr": 0.03133274597965432, "acc_norm": 0.2767290148254369, "acc_norm_stderr": 0.032124763692846635, "mc1": 0.23378212974296206, "mc1_stderr": 0.014816195991931578, "mc2": 0.38680931810418795, "mc2_stderr": 0.013939564847231014, "em": 0.001153523489932886, "em_stderr": 0.0003476179896857114, "f1": 0.04587562919463095, "f1_stderr": 0.0011468980714363175 }, "harness|arc:challenge|25": { "acc": 0.30119453924914674, "acc_stderr": 0.013406741767847627, "acc_norm": 0.3242320819112628, "acc_norm_stderr": 0.01367881039951882 }, "harness|hellaswag|10": { "acc": 0.4287990440151364, "acc_stderr": 0.0049389301432344514, "acc_norm": 0.563931487751444, "acc_norm_stderr": 0.004948824501355477 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.26, "acc_stderr": 0.04408440022768081, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768081 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.23703703703703705, "acc_stderr": 0.03673731683969506, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.03673731683969506 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.18421052631578946, "acc_stderr": 0.0315469804508223, "acc_norm": 0.18421052631578946, "acc_norm_stderr": 0.0315469804508223 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.22, "acc_stderr": 0.041633319989322716, "acc_norm": 0.22, "acc_norm_stderr": 0.041633319989322716 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2679245283018868, "acc_stderr": 0.027257260322494845, "acc_norm": 0.2679245283018868, "acc_norm_stderr": 0.027257260322494845 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2708333333333333, "acc_stderr": 0.03716177437566016, "acc_norm": 0.2708333333333333, "acc_norm_stderr": 0.03716177437566016 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.26, "acc_stderr": 0.0440844002276808, "acc_norm": 0.26, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.39, 
"acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.27167630057803466, "acc_stderr": 0.03391750322321659, "acc_norm": 0.27167630057803466, "acc_norm_stderr": 0.03391750322321659 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.17647058823529413, "acc_stderr": 0.03793281185307809, "acc_norm": 0.17647058823529413, "acc_norm_stderr": 0.03793281185307809 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.31063829787234043, "acc_stderr": 0.03025123757921317, "acc_norm": 0.31063829787234043, "acc_norm_stderr": 0.03025123757921317 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2631578947368421, "acc_stderr": 0.04142439719489362, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.04142439719489362 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.18620689655172415, "acc_stderr": 0.03243946159004616, "acc_norm": 0.18620689655172415, "acc_norm_stderr": 0.03243946159004616 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.24074074074074073, "acc_stderr": 0.0220190800122179, "acc_norm": 0.24074074074074073, "acc_norm_stderr": 0.0220190800122179 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.24603174603174602, "acc_stderr": 0.038522733649243156, "acc_norm": 0.24603174603174602, "acc_norm_stderr": 0.038522733649243156 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.22580645161290322, "acc_stderr": 0.02378557788418101, "acc_norm": 0.22580645161290322, 
"acc_norm_stderr": 0.02378557788418101 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.270935960591133, "acc_stderr": 0.031270907132976984, "acc_norm": 0.270935960591133, "acc_norm_stderr": 0.031270907132976984 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.21818181818181817, "acc_stderr": 0.03225078108306289, "acc_norm": 0.21818181818181817, "acc_norm_stderr": 0.03225078108306289 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.2222222222222222, "acc_stderr": 0.02962022787479047, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.02962022787479047 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.3471502590673575, "acc_stderr": 0.034356961683613546, "acc_norm": 0.3471502590673575, "acc_norm_stderr": 0.034356961683613546 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.34615384615384615, "acc_stderr": 0.024121125416941183, "acc_norm": 0.34615384615



