open-llm-leaderboard-old/details_Severian__ANIMA-Phi-Neptune-Mistral-7B-v1
数据集概述
数据集摘要
该数据集是在对模型 Severian/ANIMA-Phi-Neptune-Mistral-7B-v1 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集由 61 个配置组成,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
最新结果
以下是 2023-10-10T14:57:20.867230 运行的最新结果:
python { "all": { "acc": 0.5221924256666464, "acc_stderr": 0.03497779761198706, "acc_norm": 0.5257525929962562, "acc_norm_stderr": 0.03496709701060229, "mc1": 0.4112607099143207, "mc1_stderr": 0.01722562708366086, "mc2": 0.5936287801538656, "mc2_stderr": 0.015090925037000012 }, "harness|arc:challenge|25": { "acc": 0.5, "acc_stderr": 0.014611390804670088, "acc_norm": 0.5290102389078498, "acc_norm_stderr": 0.01458677635529431 }, "harness|hellaswag|10": { "acc": 0.5657239593706433, "acc_stderr": 0.004946485466544624, "acc_norm": 0.7467635929097789, "acc_norm_stderr": 0.0043397644342190655 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4074074074074074, "acc_stderr": 0.042446332383532286, "acc_norm": 0.4074074074074074, "acc_norm_stderr": 0.042446332383532286 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.4868421052631579, "acc_stderr": 0.04067533136309174, "acc_norm": 0.4868421052631579, "acc_norm_stderr": 0.04067533136309174 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.5849056603773585, "acc_stderr": 0.03032594578928611, "acc_norm": 0.5849056603773585, "acc_norm_stderr": 0.03032594578928611 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5277777777777778, "acc_stderr": 0.04174752578923185, "acc_norm": 0.5277777777777778, "acc_norm_stderr": 0.04174752578923185 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.33, "acc_stderr": 0.04725815626252604, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252604 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.45, "acc_stderr": 0.049999999999999996, "acc_norm": 0.45, "acc_norm_stderr": 0.049999999999999996 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 
0.23, "acc_stderr": 0.042295258468165065, "acc_norm": 0.23, "acc_norm_stderr": 0.042295258468165065 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5375722543352601, "acc_stderr": 0.0380168510452446, "acc_norm": 0.5375722543352601, "acc_norm_stderr": 0.0380168510452446 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.29411764705882354, "acc_stderr": 0.04533838195929777, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.04533838195929777 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.4553191489361702, "acc_stderr": 0.03255525359340355, "acc_norm": 0.4553191489361702, "acc_norm_stderr": 0.03255525359340355 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.35964912280701755, "acc_stderr": 0.04514496132873633, "acc_norm": 0.35964912280701755, "acc_norm_stderr": 0.04514496132873633 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5586206896551724, "acc_stderr": 0.04137931034482758, "acc_norm": 0.5586206896551724, "acc_norm_stderr": 0.04137931034482758 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.35714285714285715, "acc_stderr": 0.024677862841332783, "acc_norm": 0.35714285714285715, "acc_norm_stderr": 0.024677862841332783 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.36507936507936506, "acc_stderr": 0.04306241259127153, "acc_norm": 0.36507936507936506, "acc_norm_stderr": 0.04306241259127153 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.5870967741935483, "acc_stderr": 0.028009138125400387, "acc_norm": 0.5870967741935483, "acc_norm_stderr": 0.028009138125400387 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.3497536945812808, "acc_stderr": 0.03355400904969565, 
"acc_norm": 0.3497536945812808, "acc_norm_stderr": 0.03355400904969565 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.54, "acc_stderr": 0.05009082659620333, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6727272727272727, "acc_stderr": 0.036639749943912434, "acc_norm": 0.6727272727272727, "acc_norm_stderr": 0.036639749943912434 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.6212121212121212, "acc_stderr": 0.03456088731993747, "acc_norm": 0.6212121212121212, "acc_norm_stderr": 0.03456088731993747 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7202072538860104, "acc_stderr": 0.03239637046735704, "acc_norm": 0.7202072538860104, "acc_norm_stderr": 0.03239637046735704 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.4666666666666667, "acc_stderr": 0.025294608023986476, "acc_norm": 0.4666666666666667, "acc_norm_stderr": 0.025294608023986476 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.27037037037037037, "acc_stderr": 0.027080372815145665, "acc_norm": 0.27037037037037037, "acc_norm_stderr": 0.027080372815145665 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.470588



