open-llm-leaderboard-old/details_postbot__distilgpt2-emailgen-V2
数据集概述
数据集来源
- 数据集是在模型 postbot/distilgpt2-emailgen-V2 在 Open LLM Leaderboard 上的评估运行期间自动创建的。
数据集组成
- 数据集包含 64 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中都对应一个特定的分割(split),分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_postbot__distilgpt2-emailgen-V2_public",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
- 这些是最新结果,来自 2023-11-13T13:28:50.616028 的运行。
python { "all": { "acc": 0.2542066525769912, "acc_stderr": 0.030683618404772357, "acc_norm": 0.2547326716552163, "acc_norm_stderr": 0.031502030622377816, "mc1": 0.2717258261933905, "mc1_stderr": 0.015572840452875828, "mc2": 0.4651319733972654, "mc2_stderr": 0.016103347289806055, "em": 0.0, "em_stderr": 0.0, "f1": 0.003143875838926175, "f1_stderr": 0.00031171556932365637 }, "harness|arc:challenge|25": { "acc": 0.1689419795221843, "acc_stderr": 0.01094979565248503, "acc_norm": 0.2098976109215017, "acc_norm_stderr": 0.011900548748047442 }, "harness|hellaswag|10": { "acc": 0.26598287193786097, "acc_stderr": 0.004409521343140109, "acc_norm": 0.26777534355706034, "acc_norm_stderr": 0.004418948941099411 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.32592592592592595, "acc_stderr": 0.040491220417025055, "acc_norm": 0.32592592592592595, "acc_norm_stderr": 0.040491220417025055 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.16447368421052633, "acc_stderr": 0.03016753346863271, "acc_norm": 0.16447368421052633, "acc_norm_stderr": 0.03016753346863271 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.22641509433962265, "acc_stderr": 0.025757559893106744, "acc_norm": 0.22641509433962265, "acc_norm_stderr": 0.025757559893106744 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2222222222222222, "acc_stderr": 0.03476590104304134, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.03476590104304134 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.32, "acc_stderr": 0.04688261722621504, 
"acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.22, "acc_stderr": 0.04163331998932268, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932268 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.23121387283236994, "acc_stderr": 0.0321473730202947, "acc_norm": 0.23121387283236994, "acc_norm_stderr": 0.0321473730202947 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.2647058823529412, "acc_stderr": 0.043898699568087785, "acc_norm": 0.2647058823529412, "acc_norm_stderr": 0.043898699568087785 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.15, "acc_stderr": 0.035887028128263714, "acc_norm": 0.15, "acc_norm_stderr": 0.035887028128263714 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.23829787234042554, "acc_stderr": 0.02785125297388979, "acc_norm": 0.23829787234042554, "acc_norm_stderr": 0.02785125297388979 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.24561403508771928, "acc_stderr": 0.04049339297748141, "acc_norm": 0.24561403508771928, "acc_norm_stderr": 0.04049339297748141 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.2206896551724138, "acc_stderr": 0.03455930201924811, "acc_norm": 0.2206896551724138, "acc_norm_stderr": 0.03455930201924811 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.2566137566137566, "acc_stderr": 0.022494510767503154, "acc_norm": 0.2566137566137566, "acc_norm_stderr": 0.022494510767503154 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.15873015873015872, "acc_stderr": 0.03268454013011743, "acc_norm": 0.15873015873015872, "acc_norm_stderr": 0.03268454013011743 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.3161290322580645, "acc_stderr": 0.02645087448904277, "acc_norm": 0.3161290322580645, "acc_norm_stderr": 0.02645087448904277 }, 
"harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.2955665024630542, "acc_stderr": 0.032104944337514575, "acc_norm": 0.2955665024630542, "acc_norm_stderr": 0.032104944337514575 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.32, "acc_stderr": 0.04688261722621505, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.20606060606060606, "acc_stderr": 0.03158415324047707, "acc_norm": 0.20606060606060606, "acc_norm_stderr": 0.03158415324047707 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.35858585858585856, "acc_stderr": 0.03416903640391521, "acc_norm": 0.35858585858585856, "acc_norm_stderr": 0.03416903640391521 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.22797927461139897, "acc_stderr": 0.030276909945178256, "acc_norm": 0.22797927461139897, "acc_norm_stderr": 0.030276909945178256 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.2128205128205128, "acc_stderr": 0.020752423722128013, "acc_norm": 0.2128205128205128, "acc_norm_stderr": 0.020752423722128013 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.29259259259259257, "acc_stderr": 0.0277389696321



