open-llm-leaderboard-old/details_Locutusque__Orca-2-13b-SFT_v5
收藏数据集概述
数据集简介
该数据集是在评估模型 Locutusque/Orca-2-13b-SFT_v5 在 Open LLM Leaderboard 上的自动创建的。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 创建来源:从1次运行中创建。每个运行在每个配置中作为一个特定的分割存在,分割名称使用运行的时间戳。
- 训练分割:"train" 分割始终指向最新的结果。
- 结果配置:一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示在 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_Locutusque__Orca-2-13b-SFT_v5", "harness_winogrande_5", split="train")
最新结果
这些是最新的结果,来自2023-12-16T19:07:23.375645的运行: python { "all": { "acc": 0.5961306056367279, "acc_stderr": 0.03296294817457722, "acc_norm": 0.6051075721265253, "acc_norm_stderr": 0.03374567474717863, "mc1": 0.36474908200734396, "mc1_stderr": 0.01685096106172012, "mc2": 0.5183588707836261, "mc2_stderr": 0.0149463233822155 }, "harness|arc:challenge|25": { "acc": 0.5546075085324232, "acc_stderr": 0.014523987638344078, "acc_norm": 0.5921501706484642, "acc_norm_stderr": 0.014361097288449708 }, "harness|hellaswag|10": { "acc": 0.6079466241784505, "acc_stderr": 0.004872107262082463, "acc_norm": 0.8009360685122485, "acc_norm_stderr": 0.003984801854418762 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6518518518518519, "acc_stderr": 0.041153246103369526, "acc_norm": 0.6518518518518519, "acc_norm_stderr": 0.041153246103369526 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.756578947368421, "acc_stderr": 0.034923496688842384, "acc_norm": 0.756578947368421, "acc_norm_stderr": 0.034923496688842384 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6188679245283019, "acc_stderr": 0.029890609686286634, "acc_norm": 0.6188679245283019, "acc_norm_stderr": 0.029890609686286634 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6805555555555556, "acc_stderr": 0.038990736873573344, "acc_norm": 0.6805555555555556, "acc_norm_stderr": 0.038990736873573344 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.4, "acc_stderr": 0.04923659639173309, "acc_norm": 0.4, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.43, "acc_stderr": 0.04975698519562428, "acc_norm": 0.43, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5375722543352601, "acc_stderr": 0.0380168510452446, "acc_norm": 0.5375722543352601, "acc_norm_stderr": 0.0380168510452446 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3333333333333333, "acc_stderr": 0.04690650298201943, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.04690650298201943 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.4851063829787234, "acc_stderr": 0.03267151848924777, "acc_norm": 0.4851063829787234, "acc_norm_stderr": 0.03267151848924777 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.3157894736842105, "acc_stderr": 0.04372748290278007, "acc_norm": 0.3157894736842105, "acc_norm_stderr": 0.04372748290278007 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5724137931034483, "acc_stderr": 0.041227371113703316, "acc_norm": 0.5724137931034483, "acc_norm_stderr": 0.041227371113703316 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.36507936507936506, "acc_stderr": 0.024796060602699958, "acc_norm": 0.36507936507936506, "acc_norm_stderr": 0.024796060602699958 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.31746031746031744, "acc_stderr": 0.04163453031302859, "acc_norm": 0.31746031746031744, "acc_norm_stderr": 0.04163453031302859 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7161290322580646, "acc_stderr": 0.02564938106302926, "acc_norm": 0.7161290322580646, "acc_norm_stderr": 0.02564938106302926 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.4827586206896552, "acc_stderr": 0.035158955511656986, "acc_norm": 0.4827586206896552, "acc_norm_stderr": 0.035158955511656986 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.65, "acc_stderr": 0.04793724854411019, "acc_norm": 0.65, "acc_norm_stderr": 0.04793724854411019 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7515151515151515, "acc_stderr": 0.03374402644139404, "acc_norm": 0.7515151515151515, "acc_norm_stderr": 0.03374402644139404 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7525252525252525, "acc_stderr": 0.030746300742124495, "acc_norm": 0.7525252525252525, "acc_norm_stderr": 0.030746300742124495 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8393782383419689, "acc_stderr": 0.02649905770139744, "acc_norm": 0.8393782383419689, "acc_norm_stderr": 0.02649905770139744 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5666666666666667, "acc_stderr": 0.02512465352588513, "acc_norm": 0.5666666666666667, "acc_norm_stderr": 0.02512465352588513 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3296296296296296, "acc_stderr": 0.028661201116524586, "acc_norm": 0.3296296296296296, "acc_norm_stderr": 0.028661201116524586 }, "harness|hendrycksTest-high_school_microeconomics|5": { "



