open-llm-leaderboard-old/details_cognitivecomputations__dolphin-2.6-mistral-7b-dpo-laser
收藏数据集概述
该数据集是在评估模型 cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser 在 Open LLM Leaderboard 上的运行过程中自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。
数据集结构
- 配置数量:63
- 创建来源:2 次运行
- 分割命名:每个运行的分割以其时间戳命名,"train" 分割始终指向最新结果。
- 额外配置:"results" 配置存储所有运行结果的聚合,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_cognitivecomputations__dolphin-2.6-mistral-7b-dpo-laser", "harness_winogrande_5", split="train")
最新结果
以下是 2024-01-06T08:55:09.441353 运行 的最新结果:
python { "all": { "acc": 0.6321651928198004, "acc_stderr": 0.03241329296366643, "acc_norm": 0.635985368424325, "acc_norm_stderr": 0.03305944195752434, "mc1": 0.4467564259485924, "mc1_stderr": 0.017403977522557144, "mc2": 0.6171088183728592, "mc2_stderr": 0.015045730588189423 }, "harness|arc:challenge|25": { "acc": 0.628839590443686, "acc_stderr": 0.01411797190114282, "acc_norm": 0.6629692832764505, "acc_norm_stderr": 0.013813476652902274 }, "harness|hellaswag|10": { "acc": 0.662617008564031, "acc_stderr": 0.0047185047710837655, "acc_norm": 0.8572993427604063, "acc_norm_stderr": 0.0034905249650619067 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.37, "acc_stderr": 0.04852365870939098, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939098 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6148148148148148, "acc_stderr": 0.04203921040156279, "acc_norm": 0.6148148148148148, "acc_norm_stderr": 0.04203921040156279 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6776315789473685, "acc_stderr": 0.03803510248351585, "acc_norm": 0.6776315789473685, "acc_norm_stderr": 0.03803510248351585 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.690566037735849, "acc_stderr": 0.028450154794118637, "acc_norm": 0.690566037735849, "acc_norm_stderr": 0.028450154794118637 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7361111111111112, "acc_stderr": 0.03685651095897532, "acc_norm": 0.7361111111111112, "acc_norm_stderr": 0.03685651095897532 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5953757225433526, "acc_stderr": 0.03742461193887248, "acc_norm": 0.5953757225433526, "acc_norm_stderr": 0.03742461193887248 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.39215686274509803, "acc_stderr": 0.048580835742663454, "acc_norm": 0.39215686274509803, "acc_norm_stderr": 0.048580835742663454 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.78, "acc_stderr": 0.04163331998932263, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932263 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5234042553191489, "acc_stderr": 0.03265019475033582, "acc_norm": 0.5234042553191489, "acc_norm_stderr": 0.03265019475033582 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.45614035087719296, "acc_stderr": 0.046854730419077895, "acc_norm": 0.45614035087719296, "acc_norm_stderr": 0.046854730419077895 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5724137931034483, "acc_stderr": 0.041227371113703316, "acc_norm": 0.5724137931034483, "acc_norm_stderr": 0.041227371113703316 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.42592592592592593, "acc_stderr": 0.025467149045469553, "acc_norm": 0.42592592592592593, "acc_norm_stderr": 0.025467149045469553 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3968253968253968, "acc_stderr": 0.04375888492727061, "acc_norm": 0.3968253968253968, "acc_norm_stderr": 0.04375888492727061 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.39, "acc_stderr": 0.04902071300001974, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7612903225806451, "acc_stderr": 0.02425107126220884, "acc_norm": 0.7612903225806451, "acc_norm_stderr": 0.02425107126220884 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5024630541871922, "acc_stderr": 0.03517945038691063, "acc_norm": 0.5024630541871922, "acc_norm_stderr": 0.03517945038691063 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.72, "acc_stderr": 0.04512608598542127, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542127 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7636363636363637, "acc_stderr": 0.03317505930009181, "acc_norm": 0.7636363636363637, "acc_norm_stderr": 0.03317505930009181 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7929292929292929, "acc_stderr": 0.028869778460267042, "acc_norm": 0.7929292929292929, "acc_norm_stderr": 0.028869778460267042 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8808290155440415, "acc_stderr": 0.023381935348121437, "acc_norm": 0.8808290155440415, "acc_norm_stderr": 0.023381935348121437 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6153846153846154, "acc_stderr": 0.02466674491518721, "acc_norm": 0.6153846153846154, "acc_norm_stderr": 0.02466674491518721 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2777777777777778, "acc_stderr": 0.02730914058823019, "acc_norm": 0.2777777777777




