open-llm-leaderboard-old/details_macadeliccc__SOLAR-10.7b-Instruct-dpo
数据集概述
该数据集是在对模型 macadeliccc/SOLAR-10.7b-Instruct-dpo 进行评估运行时自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_macadeliccc__SOLAR-10.7b-Instruct-dpo", "harness_winogrande_5", split="train")
```
最新结果
以下是运行 2024-01-25T05:53:40.914982 的最新结果摘要:
python { "all": { "acc": 0.6636898445574112, "acc_stderr": 0.031769837970977544, "acc_norm": 0.6652603795002481, "acc_norm_stderr": 0.03240911882592747, "mc1": 0.5642594859241126, "mc1_stderr": 0.01735834539886313, "mc2": 0.719754146671385, "mc2_stderr": 0.014988200007339842 }, "harness|arc:challenge|25": { "acc": 0.6885665529010239, "acc_stderr": 0.013532472099850945, "acc_norm": 0.7175767918088737, "acc_norm_stderr": 0.013155456884097222 }, "harness|hellaswag|10": { "acc": 0.7110137422824139, "acc_stderr": 0.00452365118401626, "acc_norm": 0.8808006373232424, "acc_norm_stderr": 0.003233607423889983 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.42, "acc_stderr": 0.04960449637488583, "acc_norm": 0.42, "acc_norm_stderr": 0.04960449637488583 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6148148148148148, "acc_stderr": 0.04203921040156279, "acc_norm": 0.6148148148148148, "acc_norm_stderr": 0.04203921040156279 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.743421052631579, "acc_stderr": 0.0355418036802569, "acc_norm": 0.743421052631579, "acc_norm_stderr": 0.0355418036802569 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.73, "acc_stderr": 0.04461960433384741, "acc_norm": 0.73, "acc_norm_stderr": 0.04461960433384741 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6867924528301886, "acc_stderr": 0.028544793319055326, "acc_norm": 0.6867924528301886, "acc_norm_stderr": 0.028544793319055326 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7569444444444444, "acc_stderr": 0.03586879280080341, "acc_norm": 0.7569444444444444, "acc_norm_stderr": 0.03586879280080341 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.46, "acc_stderr": 0.05009082659620333, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6820809248554913, "acc_stderr": 0.0355068398916558, "acc_norm": 0.6820809248554913, "acc_norm_stderr": 0.0355068398916558 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.37254901960784315, "acc_stderr": 0.04810840148082636, "acc_norm": 0.37254901960784315, "acc_norm_stderr": 0.04810840148082636 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.73, "acc_stderr": 0.04461960433384739, "acc_norm": 0.73, "acc_norm_stderr": 0.04461960433384739 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6340425531914894, "acc_stderr": 0.031489558297455304, "acc_norm": 0.6340425531914894, "acc_norm_stderr": 0.031489558297455304 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5087719298245614, "acc_stderr": 0.04702880432049615, "acc_norm": 0.5087719298245614, "acc_norm_stderr": 0.04702880432049615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6137931034482759, "acc_stderr": 0.04057324734419035, "acc_norm": 0.6137931034482759, "acc_norm_stderr": 0.04057324734419035 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.47883597883597884, "acc_stderr": 0.025728230952130733, "acc_norm": 0.47883597883597884, "acc_norm_stderr": 0.025728230952130733 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.42857142857142855, "acc_stderr": 0.0442626668137991, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.0442626668137991 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8129032258064516, "acc_stderr": 0.022185710092252255, "acc_norm": 0.8129032258064516, "acc_norm_stderr": 0.022185710092252255 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.4975369458128079, "acc_stderr": 
0.035179450386910616, "acc_norm": 0.4975369458128079, "acc_norm_stderr": 0.035179450386910616 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.72, "acc_stderr": 0.04512608598542128, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.806060606060606, "acc_stderr": 0.03087414513656208, "acc_norm": 0.806060606060606, "acc_norm_stderr": 0.03087414513656208 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8585858585858586, "acc_stderr": 0.02482590979334334, "acc_norm": 0.8585858585858586, "acc_norm_stderr": 0.02482590979334334 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8963730569948186, "acc_stderr": 0.021995311963644244, "acc_norm": 0.8963730569948186, "acc_norm_stderr": 0.021995311963644244 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6692307692307692, "acc_stderr": 0.02385479568097114, "acc_norm": 0.6692307692307692, "acc_norm_stderr": 0.02385479568097114 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.37037037037037035, "acc_stderr": 0.02944316932303154, "acc_norm": 0.37037037037037



