open-llm-leaderboard-old/details_cloudyu__19B_TRUTH_DPO
收藏数据集概述
该数据集是在对模型 cloudyu/19B_TRUTH_DPO 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集从 1 次运行中创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_cloudyu__19B_TRUTH_DPO", "harness_winogrande_5", split="train")
最新结果
以下是 2024-02-02T05:24:23.880496 运行的最新结果:
python { "all": { "acc": 0.6604281536017871, "acc_stderr": 0.03188067819207853, "acc_norm": 0.6626612481630715, "acc_norm_stderr": 0.03252011256731915, "mc1": 0.5214198286413708, "mc1_stderr": 0.01748743214471164, "mc2": 0.7223001888589835, "mc2_stderr": 0.014760061606764314 }, "harness|arc:challenge|25": { "acc": 0.6979522184300341, "acc_stderr": 0.013417519144716417, "acc_norm": 0.7167235494880546, "acc_norm_stderr": 0.013167478735134575 }, "harness|hellaswag|10": { "acc": 0.7092212706632145, "acc_stderr": 0.0045319353915070065, "acc_norm": 0.8862776339374626, "acc_norm_stderr": 0.003168249351889306 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6370370370370371, "acc_stderr": 0.04153948404742398, "acc_norm": 0.6370370370370371, "acc_norm_stderr": 0.04153948404742398 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7368421052631579, "acc_stderr": 0.03583496176361072, "acc_norm": 0.7368421052631579, "acc_norm_stderr": 0.03583496176361072 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.74, "acc_stderr": 0.0440844002276808, "acc_norm": 0.74, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6792452830188679, "acc_stderr": 0.028727502957880267, "acc_norm": 0.6792452830188679, "acc_norm_stderr": 0.028727502957880267 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7708333333333334, "acc_stderr": 0.03514697467862388, "acc_norm": 0.7708333333333334, "acc_norm_stderr": 0.03514697467862388 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.46, "acc_stderr": 0.05009082659620333, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.653179190751445, "acc_stderr": 0.036291466701596636, "acc_norm": 0.653179190751445, "acc_norm_stderr": 0.036291466701596636 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.04897104952726366, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.04897104952726366 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.72, "acc_stderr": 0.04512608598542127, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542127 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6297872340425532, "acc_stderr": 0.03156564682236786, "acc_norm": 0.6297872340425532, "acc_norm_stderr": 0.03156564682236786 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.49122807017543857, "acc_stderr": 0.04702880432049615, "acc_norm": 0.49122807017543857, "acc_norm_stderr": 0.04702880432049615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6137931034482759, "acc_stderr": 0.04057324734419035, "acc_norm": 0.6137931034482759, "acc_norm_stderr": 0.04057324734419035 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.48677248677248675, "acc_stderr": 0.025742297289575142, "acc_norm": 0.48677248677248675, "acc_norm_stderr": 0.025742297289575142 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4603174603174603, "acc_stderr": 0.04458029125470973, "acc_norm": 0.4603174603174603, "acc_norm_stderr": 0.04458029125470973 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8032258064516129, "acc_stderr": 0.022616409420742025, "acc_norm": 0.8032258064516129, "acc_norm_stderr": 0.022616409420742025 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5369458128078818, "acc_stderr": 0.035083705204426656, "acc_norm": 0.5369458128078818, "acc_norm_stderr": 0.035083705204426656 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8, "acc_stderr": 0.031234752377721175, "acc_norm": 0.8, "acc_norm_stderr": 0.031234752377721175 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8484848484848485, "acc_stderr": 0.025545650426603627, "acc_norm": 0.8484848484848485, "acc_norm_stderr": 0.025545650426603627 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8911917098445595, "acc_stderr": 0.022473253332768766, "acc_norm": 0.8911917098445595, "acc_norm_stderr": 0.022473253332768766 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6717948717948717, "acc_stderr": 0.023807633198657266, "acc_norm": 0.6717948717948717, "acc_norm_stderr": 0.023807633198657266 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.34074074074074073, "acc_stderr": 0.028897748741131137, "acc_norm": 0.34074074074074073, "acc_norm_stderr": 0.028897748741131137 }, "harness|hend



