open-llm-leaderboard-old/details_alnrg2arg__test2
数据集概述
该数据集是在对模型 alnrg2arg/test2 进行评估运行期间自动创建的,评估结果发布在 Open LLM Leaderboard 上。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_alnrg2arg__test2",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-01-15T11:22:11.663514 运行的最新结果:
python { "all": { "acc": 0.24636436924076846, "acc_stderr": 0.03057531615216942, "acc_norm": 0.24707158644894944, "acc_norm_stderr": 0.031385477138922584, "mc1": 0.2460220318237454, "mc1_stderr": 0.015077219200662571, "mc2": 0.5013831681930769, "mc2_stderr": 0.017248638043307455 }, "harness|arc:challenge|25": { "acc": 0.23293515358361774, "acc_stderr": 0.012352507042617408, "acc_norm": 0.2721843003412969, "acc_norm_stderr": 0.013006600406423706 }, "harness|hellaswag|10": { "acc": 0.2539334793865764, "acc_stderr": 0.0043437045123801, "acc_norm": 0.26249751045608444, "acc_norm_stderr": 0.004390923353200561 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.17, "acc_stderr": 0.0377525168068637, "acc_norm": 0.17, "acc_norm_stderr": 0.0377525168068637 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.2, "acc_stderr": 0.03455473702325437, "acc_norm": 0.2, "acc_norm_stderr": 0.03455473702325437 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.2236842105263158, "acc_stderr": 0.03391160934343604, "acc_norm": 0.2236842105263158, "acc_norm_stderr": 0.03391160934343604 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2679245283018868, "acc_stderr": 0.027257260322494845, "acc_norm": 0.2679245283018868, "acc_norm_stderr": 0.027257260322494845 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.24305555555555555, "acc_stderr": 0.03586879280080341, "acc_norm": 0.24305555555555555, "acc_norm_stderr": 0.03586879280080341 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.27, "acc_stderr": 0.04461960433384741, "acc_norm": 0.27, "acc_norm_stderr": 0.04461960433384741 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.22, 
"acc_stderr": 0.04163331998932268, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932268 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.2658959537572254, "acc_stderr": 0.0336876293225943, "acc_norm": 0.2658959537572254, "acc_norm_stderr": 0.0336876293225943 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.22549019607843138, "acc_stderr": 0.041583075330832865, "acc_norm": 0.22549019607843138, "acc_norm_stderr": 0.041583075330832865 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.2723404255319149, "acc_stderr": 0.029101290698386708, "acc_norm": 0.2723404255319149, "acc_norm_stderr": 0.029101290698386708 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2543859649122807, "acc_stderr": 0.040969851398436716, "acc_norm": 0.2543859649122807, "acc_norm_stderr": 0.040969851398436716 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.2827586206896552, "acc_stderr": 0.03752833958003336, "acc_norm": 0.2827586206896552, "acc_norm_stderr": 0.03752833958003336 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.23544973544973544, "acc_stderr": 0.021851509822031715, "acc_norm": 0.23544973544973544, "acc_norm_stderr": 0.021851509822031715 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.31746031746031744, "acc_stderr": 0.04163453031302859, "acc_norm": 0.31746031746031744, "acc_norm_stderr": 0.04163453031302859 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.26, "acc_stderr": 0.0440844002276808, "acc_norm": 0.26, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.24516129032258063, "acc_stderr": 0.024472243840895518, "acc_norm": 0.24516129032258063, "acc_norm_stderr": 0.024472243840895518 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.26108374384236455, "acc_stderr": 0.03090379695211449, 
"acc_norm": 0.26108374384236455, "acc_norm_stderr": 0.03090379695211449 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.17, "acc_stderr": 0.03775251680686371, "acc_norm": 0.17, "acc_norm_stderr": 0.03775251680686371 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.20606060606060606, "acc_stderr": 0.03158415324047709, "acc_norm": 0.20606060606060606, "acc_norm_stderr": 0.03158415324047709 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.29292929292929293, "acc_stderr": 0.03242497958178817, "acc_norm": 0.29292929292929293, "acc_norm_stderr": 0.03242497958178817 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.2694300518134715, "acc_stderr": 0.03201867122877794, "acc_norm": 0.2694300518134715, "acc_norm_stderr": 0.03201867122877794 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.2564102564102564, "acc_stderr": 0.02213908110397153, "acc_norm": 0.2564102564102564, "acc_norm_stderr": 0.02213908110397153 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.24074074074074073, "acc_stderr": 0.0260671592222758, "acc_norm": 0.24074074074074073, "acc_norm_stderr": 0.0260671592222758 }, "harness|hendrycksTest-high_school



