open-llm-leaderboard-old/details_ewqr2130__alignment-handbook-zephyr-7b-sft-full-dpo-5e7-cont1
数据集概述
数据集摘要
该数据集是在对模型 ewqr2130/alignment-handbook-zephyr-7b-sft-full-dpo-5e7-cont1 进行评估运行时自动创建的,用于 Open LLM Leaderboard。
数据集组成
数据集由 63 个配置组成,每个配置对应一个评估任务。数据集由 1 次运行创建而成;每次运行在各个配置中对应一个特定的分割,分割以该次运行的时间戳命名。"train" 分割始终指向最新的结果。
额外配置
一个额外的配置 "results" 存储了所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
加载数据示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_ewqr2130__alignment-handbook-zephyr-7b-sft-full-dpo-5e7-cont1",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-01-15T02:49:27.291692 运行的最新结果:
python { "all": { "acc": 0.6032096253875614, "acc_stderr": 0.03321637816759657, "acc_norm": 0.6097201219482176, "acc_norm_stderr": 0.033909808173675136, "mc1": 0.27906976744186046, "mc1_stderr": 0.015702107090627904, "mc2": 0.40550458795616723, "mc2_stderr": 0.015282277248005289 }, "harness|arc:challenge|25": { "acc": 0.5614334470989761, "acc_stderr": 0.014500682618212865, "acc_norm": 0.6023890784982935, "acc_norm_stderr": 0.01430175222327954 }, "harness|hellaswag|10": { "acc": 0.6253734315873332, "acc_stderr": 0.00483037131784105, "acc_norm": 0.8228440549691296, "acc_norm_stderr": 0.003810203308901103 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5925925925925926, "acc_stderr": 0.04244633238353228, "acc_norm": 0.5925925925925926, "acc_norm_stderr": 0.04244633238353228 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.625, "acc_stderr": 0.039397364351956274, "acc_norm": 0.625, "acc_norm_stderr": 0.039397364351956274 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.53, "acc_stderr": 0.05016135580465919, "acc_norm": 0.53, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6641509433962264, "acc_stderr": 0.029067220146644826, "acc_norm": 0.6641509433962264, "acc_norm_stderr": 0.029067220146644826 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6805555555555556, "acc_stderr": 0.038990736873573344, "acc_norm": 0.6805555555555556, "acc_norm_stderr": 0.038990736873573344 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.4, "acc_stderr": 0.04923659639173309, "acc_norm": 0.4, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.32, 
"acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6184971098265896, "acc_stderr": 0.03703851193099521, "acc_norm": 0.6184971098265896, "acc_norm_stderr": 0.03703851193099521 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.04897104952726366, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.04897104952726366 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.78, "acc_stderr": 0.04163331998932261, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932261 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5446808510638298, "acc_stderr": 0.03255525359340355, "acc_norm": 0.5446808510638298, "acc_norm_stderr": 0.03255525359340355 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.40350877192982454, "acc_stderr": 0.046151869625837026, "acc_norm": 0.40350877192982454, "acc_norm_stderr": 0.046151869625837026 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5793103448275863, "acc_stderr": 0.04113914981189261, "acc_norm": 0.5793103448275863, "acc_norm_stderr": 0.04113914981189261 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.3941798941798942, "acc_stderr": 0.025167982333894143, "acc_norm": 0.3941798941798942, "acc_norm_stderr": 0.025167982333894143 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.373015873015873, "acc_stderr": 0.04325506042017086, "acc_norm": 0.373015873015873, "acc_norm_stderr": 0.04325506042017086 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.41, "acc_stderr": 0.04943110704237102, "acc_norm": 0.41, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7225806451612903, "acc_stderr": 0.025470196835900055, "acc_norm": 0.7225806451612903, "acc_norm_stderr": 0.025470196835900055 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5172413793103449, "acc_stderr": 0.035158955511656986, "acc_norm": 
0.5172413793103449, "acc_norm_stderr": 0.035158955511656986 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.68, "acc_stderr": 0.04688261722621504, "acc_norm": 0.68, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7212121212121212, "acc_stderr": 0.035014387062967806, "acc_norm": 0.7212121212121212, "acc_norm_stderr": 0.035014387062967806 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7424242424242424, "acc_stderr": 0.03115626951964684, "acc_norm": 0.7424242424242424, "acc_norm_stderr": 0.03115626951964684 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8031088082901554, "acc_stderr": 0.028697873971860677, "acc_norm": 0.8031088082901554, "acc_norm_stderr": 0.028697873971860677 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5897435897435898, "acc_stderr": 0.0249393139069408, "acc_norm": 0.5897435897435898, "acc_norm_stderr": 0.0249393139069408 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3296296296296296, "acc_stderr": 0.028661201116524572, "acc_norm": 0.3296296296296296,



