open-llm-leaderboard-old/details_jondurbin__bagel-dpo-7b-v0.1
收藏数据集概述
该数据集是在对模型 jondurbin/bagel-dpo-7b-v0.1 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集从 1 次运行中创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_jondurbin__bagel-dpo-7b-v0.1", "harness_winogrande_5", split="train")
最新结果
以下是 2023-12-13T21:40:44.143463 运行的最新结果:
python { "all": { "acc": 0.6422712088279074, "acc_stderr": 0.03234727019286789, "acc_norm": 0.6461514734611536, "acc_norm_stderr": 0.03299893056427725, "mc1": 0.4834761321909425, "mc1_stderr": 0.017493940190057723, "mc2": 0.6404580486077848, "mc2_stderr": 0.01530793190912482 }, "harness|arc:challenge|25": { "acc": 0.6331058020477816, "acc_stderr": 0.014084133118104296, "acc_norm": 0.6672354948805461, "acc_norm_stderr": 0.013769863046192302 }, "harness|hellaswag|10": { "acc": 0.6461860187213703, "acc_stderr": 0.004771751187407017, "acc_norm": 0.8415654252141008, "acc_norm_stderr": 0.0036440173837115923 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.04725815626252606, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252606 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6148148148148148, "acc_stderr": 0.04203921040156279, "acc_norm": 0.6148148148148148, "acc_norm_stderr": 0.04203921040156279 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6842105263157895, "acc_stderr": 0.0378272898086547, "acc_norm": 0.6842105263157895, "acc_norm_stderr": 0.0378272898086547 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.61, "acc_stderr": 0.04902071300001975, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7094339622641509, "acc_stderr": 0.027943219989337128, "acc_norm": 0.7094339622641509, "acc_norm_stderr": 0.027943219989337128 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7430555555555556, "acc_stderr": 0.03653946969442099, "acc_norm": 0.7430555555555556, "acc_norm_stderr": 0.03653946969442099 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.630057803468208, "acc_stderr": 0.0368122963339432, "acc_norm": 0.630057803468208, "acc_norm_stderr": 0.0368122963339432 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.38235294117647056, "acc_stderr": 0.04835503696107223, "acc_norm": 0.38235294117647056, "acc_norm_stderr": 0.04835503696107223 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.74, "acc_stderr": 0.0440844002276808, "acc_norm": 0.74, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5617021276595745, "acc_stderr": 0.03243618636108102, "acc_norm": 0.5617021276595745, "acc_norm_stderr": 0.03243618636108102 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.49122807017543857, "acc_stderr": 0.04702880432049615, "acc_norm": 0.49122807017543857, "acc_norm_stderr": 0.04702880432049615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.496551724137931, "acc_stderr": 0.041665675771015785, "acc_norm": 0.496551724137931, "acc_norm_stderr": 0.041665675771015785 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41005291005291006, "acc_stderr": 0.025331202438944423, "acc_norm": 0.41005291005291006, "acc_norm_stderr": 0.025331202438944423 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.49206349206349204, "acc_stderr": 0.044715725362943486, "acc_norm": 0.49206349206349204, "acc_norm_stderr": 0.044715725362943486 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8, "acc_stderr": 0.022755204959542946, "acc_norm": 0.8, "acc_norm_stderr": 0.022755204959542946 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5073891625615764, "acc_stderr": 0.035176035403610105, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.68, "acc_stderr": 0.04688261722621504, "acc_norm": 0.68, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7818181818181819, "acc_stderr": 0.032250781083062896, "acc_norm": 0.7818181818181819, "acc_norm_stderr": 0.032250781083062896 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7828282828282829, "acc_stderr": 0.029376616484945633, "acc_norm": 0.7828282828282829, "acc_norm_stderr": 0.029376616484945633 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8549222797927462, "acc_stderr": 0.025416343096306422, "acc_norm": 0.8549222797927462, "acc_norm_stderr": 0.025416343096306422 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6666666666666666, "acc_stderr": 0.023901157979402534, "acc_norm": 0.6666666666666666, "acc_norm_stderr": 0.023901157979402534 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3074074074074074, "acc_stderr": 0.02813325257881564, "acc_norm": 0.3074074074074074, "acc_norm_stderr": 0.02813325257881564 }, "harness|hendrycksTest-high_school_microeconom



