open-llm-leaderboard-old/details_walebadr__Mistral-7B-v0.1-DPO
数据集概述
数据集摘要
该数据集是在对模型 walebadr/Mistral-7B-v0.1-DPO 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 2 次运行创建。每次运行在每个配置中对应一个特定的分割(split),分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_walebadr__Mistral-7B-v0.1-DPO", "harness_winogrande_5", split="train")
```
最新结果
以下是来自运行 2024-01-13T18:02:23.868441 的最新结果摘要: python { "all": { "acc": 0.2513839168298002, "acc_stderr": 0.03077453939218842, "acc_norm": 0.2517964377923722, "acc_norm_stderr": 0.03159254911508562, "mc1": 0.24357405140758873, "mc1_stderr": 0.015026354824910782, "mc2": 0.4935990954197777, "mc2_stderr": 0.017220011527240037 }, "harness|arc:challenge|25": { "acc": 0.23464163822525597, "acc_stderr": 0.012383873560768673, "acc_norm": 0.2781569965870307, "acc_norm_stderr": 0.0130944699195388 }, "harness|hellaswag|10": { "acc": 0.2562238597888867, "acc_stderr": 0.004356547185847042, "acc_norm": 0.2622983469428401, "acc_norm_stderr": 0.004389849907040314 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.18, "acc_stderr": 0.03861229196653694, "acc_norm": 0.18, "acc_norm_stderr": 0.03861229196653694 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.2222222222222222, "acc_stderr": 0.0359144408419697, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.0359144408419697 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.28289473684210525, "acc_stderr": 0.03665349695640767, "acc_norm": 0.28289473684210525, "acc_norm_stderr": 0.03665349695640767 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.23, "acc_stderr": 0.04229525846816503, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816503 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.22641509433962265, "acc_stderr": 0.025757559893106765, "acc_norm": 0.22641509433962265, "acc_norm_stderr": 0.025757559893106765 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.20833333333333334, "acc_stderr": 0.033961162058453336, "acc_norm": 0.20833333333333334, "acc_norm_stderr": 0.033961162058453336 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.27, "acc_stderr": 0.0446196043338474, "acc_norm": 0.27, "acc_norm_stderr": 0.0446196043338474 
}, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.24855491329479767, "acc_stderr": 0.03295304696818318, "acc_norm": 0.24855491329479767, "acc_norm_stderr": 0.03295304696818318 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.30392156862745096, "acc_stderr": 0.045766654032077636, "acc_norm": 0.30392156862745096, "acc_norm_stderr": 0.045766654032077636 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.23, "acc_stderr": 0.042295258468165065, "acc_norm": 0.23, "acc_norm_stderr": 0.042295258468165065 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.33617021276595743, "acc_stderr": 0.030881618520676942, "acc_norm": 0.33617021276595743, "acc_norm_stderr": 0.030881618520676942 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2719298245614035, "acc_stderr": 0.04185774424022057, "acc_norm": 0.2719298245614035, "acc_norm_stderr": 0.04185774424022057 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.22758620689655173, "acc_stderr": 0.03493950380131184, "acc_norm": 0.22758620689655173, "acc_norm_stderr": 0.03493950380131184 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.24603174603174602, "acc_stderr": 0.022182037202948368, "acc_norm": 0.24603174603174602, "acc_norm_stderr": 0.022182037202948368 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3333333333333333, "acc_stderr": 0.04216370213557835, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.04216370213557835 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.25161290322580643, "acc_stderr": 0.024685979286239956, "acc_norm": 0.25161290322580643, "acc_norm_stderr": 0.024685979286239956 }, 
"harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.2019704433497537, "acc_stderr": 0.028247350122180253, "acc_norm": 0.2019704433497537, "acc_norm_stderr": 0.028247350122180253 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.20606060606060606, "acc_stderr": 0.031584153240477086, "acc_norm": 0.20606060606060606, "acc_norm_stderr": 0.031584153240477086 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.2676767676767677, "acc_stderr": 0.03154449888270286, "acc_norm": 0.2676767676767677, "acc_norm_stderr": 0.03154449888270286 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.2694300518134715, "acc_stderr": 0.03201867122877794, "acc_norm": 0.2694300518134715, "acc_norm_stderr": 0.03201867122877794 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.2641025641025641, "acc_stderr": 0.02235219373745327, "acc_norm": 0.2641025641025641, "acc_norm_stderr": 0.02235219373745327 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.23703703703703705, "acc_stderr": 0.025928876132



