open-llm-leaderboard-old/details_kyujinpy__Sakura-SOLRCA-Math-Instruct-DPO-v2
数据集概述
数据集摘要
该数据集是在评估模型 kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2 在 Open LLM Leaderboard 上的运行过程中自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建;每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_kyujinpy__Sakura-SOLRCA-Math-Instruct-DPO-v2",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2023-12-29T14:41:22.828314 运行的最新结果:
python { "all": { "acc": 0.6650401628251246, "acc_stderr": 0.03167161493090799, "acc_norm": 0.6659696738358851, "acc_norm_stderr": 0.0323143824893023, "mc1": 0.5691554467564259, "mc1_stderr": 0.01733527247533237, "mc2": 0.7215851762165506, "mc2_stderr": 0.014925941232169025 }, "harness|arc:challenge|25": { "acc": 0.6868600682593856, "acc_stderr": 0.0135526715436235, "acc_norm": 0.712457337883959, "acc_norm_stderr": 0.013226719056266127 }, "harness|hellaswag|10": { "acc": 0.7165903206532563, "acc_stderr": 0.004497325533959638, "acc_norm": 0.8851822346146186, "acc_norm_stderr": 0.0031815035060543226 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6222222222222222, "acc_stderr": 0.04188307537595853, "acc_norm": 0.6222222222222222, "acc_norm_stderr": 0.04188307537595853 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.756578947368421, "acc_stderr": 0.034923496688842384, "acc_norm": 0.756578947368421, "acc_norm_stderr": 0.034923496688842384 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.73, "acc_stderr": 0.04461960433384741, "acc_norm": 0.73, "acc_norm_stderr": 0.04461960433384741 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6792452830188679, "acc_stderr": 0.02872750295788027, "acc_norm": 0.6792452830188679, "acc_norm_stderr": 0.02872750295788027 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7638888888888888, "acc_stderr": 0.03551446610810826, "acc_norm": 0.7638888888888888, "acc_norm_stderr": 0.03551446610810826 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.46, "acc_stderr": 0.05009082659620333, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.05021167315686779, "acc_norm": 0.52, "acc_norm_stderr": 0.05021167315686779 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6763005780346821, "acc_stderr": 0.035676037996391706, "acc_norm": 0.6763005780346821, "acc_norm_stderr": 0.035676037996391706 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.37254901960784315, "acc_stderr": 0.04810840148082636, "acc_norm": 0.37254901960784315, "acc_norm_stderr": 0.04810840148082636 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6297872340425532, "acc_stderr": 0.03156564682236786, "acc_norm": 0.6297872340425532, "acc_norm_stderr": 0.03156564682236786 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.49122807017543857, "acc_stderr": 0.04702880432049615, "acc_norm": 0.49122807017543857, "acc_norm_stderr": 0.04702880432049615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6275862068965518, "acc_stderr": 0.04028731532947558, "acc_norm": 0.6275862068965518, "acc_norm_stderr": 0.04028731532947558 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.49206349206349204, "acc_stderr": 0.02574806587167328, "acc_norm": 0.49206349206349204, "acc_norm_stderr": 0.02574806587167328 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4444444444444444, "acc_stderr": 0.044444444444444495, "acc_norm": 0.4444444444444444, "acc_norm_stderr": 0.044444444444444495 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8096774193548387, "acc_stderr": 0.022331707611823078, "acc_norm": 0.8096774193548387, "acc_norm_stderr": 0.022331707611823078 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5073891625615764, "acc_stderr": 
0.035176035403610105, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.72, "acc_stderr": 0.04512608598542128, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.806060606060606, "acc_stderr": 0.03087414513656209, "acc_norm": 0.806060606060606, "acc_norm_stderr": 0.03087414513656209 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8686868686868687, "acc_stderr": 0.024063156416822516, "acc_norm": 0.8686868686868687, "acc_norm_stderr": 0.024063156416822516 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8963730569948186, "acc_stderr": 0.021995311963644244, "acc_norm": 0.8963730569948186, "acc_norm_stderr": 0.021995311963644244 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6641025641025641, "acc_stderr": 0.023946724741563976, "acc_norm": 0.6641025641025641, "acc_norm_stderr": 0.023946724741563976 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.37037037037037035, "acc_stderr": 0.02944316932303154,



