open-llm-leaderboard-old/details_stabilityai__stablelm-2-12b-chat
收藏数据集概述
数据集简介
该数据集是在评估模型 stabilityai/stablelm-2-12b-chat 的过程中自动创建的,用于 Open LLM Leaderboard 的评估。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建,每个运行在每个配置中作为一个特定的分割存在,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_stabilityai__stablelm-2-12b-chat", "harness_winogrande_5", split="train")
最新结果
以下是 2024-04-19T04:44:10.886217 运行的最新结果:
python { "all": { "acc": 0.6147613880045496, "acc_stderr": 0.03283664441150552, "acc_norm": 0.6154489904330765, "acc_norm_stderr": 0.03350311233608589, "mc1": 0.47368421052631576, "mc1_stderr": 0.017479241161975526, "mc2": 0.6201476275069401, "mc2_stderr": 0.016117413140109702 }, "harness|arc:challenge|25": { "acc": 0.6399317406143344, "acc_stderr": 0.014027516814585184, "acc_norm": 0.6484641638225256, "acc_norm_stderr": 0.013952413699600938 }, "harness|hellaswag|10": { "acc": 0.6932881896036646, "acc_stderr": 0.004601862807240197, "acc_norm": 0.8595897231627166, "acc_norm_stderr": 0.0034670217932838317 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.26, "acc_stderr": 0.04408440022768079, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768079 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.562962962962963, "acc_stderr": 0.04284958639753401, "acc_norm": 0.562962962962963, "acc_norm_stderr": 0.04284958639753401 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6052631578947368, "acc_stderr": 0.039777499346220734, "acc_norm": 0.6052631578947368, "acc_norm_stderr": 0.039777499346220734 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6754716981132075, "acc_stderr": 0.02881561571343211, "acc_norm": 0.6754716981132075, "acc_norm_stderr": 0.02881561571343211 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7013888888888888, "acc_stderr": 0.03827052357950756, "acc_norm": 0.7013888888888888, "acc_norm_stderr": 0.03827052357950756 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.46, "acc_stderr": 0.05009082659620333, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5606936416184971, "acc_stderr": 0.037842719328874674, "acc_norm": 0.5606936416184971, "acc_norm_stderr": 0.037842719328874674 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3333333333333333, "acc_stderr": 0.04690650298201943, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.04690650298201943 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5617021276595745, "acc_stderr": 0.032436186361081004, "acc_norm": 0.5617021276595745, "acc_norm_stderr": 0.032436186361081004 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.43859649122807015, "acc_stderr": 0.04668000738510455, "acc_norm": 0.43859649122807015, "acc_norm_stderr": 0.04668000738510455 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5655172413793104, "acc_stderr": 0.04130740879555497, "acc_norm": 0.5655172413793104, "acc_norm_stderr": 0.04130740879555497 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4074074074074074, "acc_stderr": 0.02530590624159063, "acc_norm": 0.4074074074074074, "acc_norm_stderr": 0.02530590624159063 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.36507936507936506, "acc_stderr": 0.04306241259127153, "acc_norm": 0.36507936507936506, "acc_norm_stderr": 0.04306241259127153 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.32, "acc_stderr": 0.04688261722621504, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6903225806451613, "acc_stderr": 0.026302774983517418, "acc_norm": 0.6903225806451613, "acc_norm_stderr": 0.026302774983517418 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.46798029556650245, "acc_stderr": 0.035107665979592154, "acc_norm": 0.46798029556650245, "acc_norm_stderr": 0.035107665979592154 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.61, "acc_stderr": 0.04902071300001974, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7818181818181819, "acc_stderr": 0.032250781083062896, "acc_norm": 0.7818181818181819, "acc_norm_stderr": 0.032250781083062896 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7626262626262627, "acc_stderr": 0.030313710538198892, "acc_norm": 0.7626262626262627, "acc_norm_stderr": 0.030313710538198892 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8704663212435233, "acc_stderr": 0.02423353229775872, "acc_norm": 0.8704663212435233, "acc_norm_stderr": 0.02423353229775872 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5666666666666667, "acc_stderr": 0.025124653525885117, "acc_norm": 0.5666666666666667, "acc_norm_stderr": 0.025124653525885117 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.35555555555555557, "acc_stderr": 0.0291857149498574, "acc_norm": 0.35555555555555557, "acc_norm_stderr": 0.0291857149498



