open-llm-leaderboard-old/details_liminerity__dhbacmes-3b-slerp
数据集概述
该数据集是在 Open LLM Leaderboard 上对模型 liminerity/dhbacmes-3b-slerp 进行评估运行期间自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 2 次运行的结果创建而成。每次运行在各个配置中都对应一个特定的分割(split),分割名称即该次运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset
data = load_dataset("open-llm-leaderboard/details_liminerity__dhbacmes-3b-slerp", "harness_winogrande_5", split="train")
```
最新结果
以下是运行 2024-02-29T20:03:05.684922 的最新结果:
python { "all": { "acc": 0.5276648412815843, "acc_stderr": 0.03438428439388686, "acc_norm": 0.5310695726530207, "acc_norm_stderr": 0.03508696025146873, "mc1": 0.2607099143206854, "mc1_stderr": 0.015368841620766368, "mc2": 0.40412360273636533, "mc2_stderr": 0.014383564900315697 }, "harness|arc:challenge|25": { "acc": 0.4061433447098976, "acc_stderr": 0.014351656690097862, "acc_norm": 0.4522184300341297, "acc_norm_stderr": 0.014544519880633832 }, "harness|hellaswag|10": { "acc": 0.5204142601075483, "acc_stderr": 0.004985620773683433, "acc_norm": 0.7077275443138817, "acc_norm_stderr": 0.004538773493746559 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.43703703703703706, "acc_stderr": 0.04284958639753399, "acc_norm": 0.43703703703703706, "acc_norm_stderr": 0.04284958639753399 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5789473684210527, "acc_stderr": 0.040179012759817494, "acc_norm": 0.5789473684210527, "acc_norm_stderr": 0.040179012759817494 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.5622641509433962, "acc_stderr": 0.030533338430467516, "acc_norm": 0.5622641509433962, "acc_norm_stderr": 0.030533338430467516 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6041666666666666, "acc_stderr": 0.04089465449325582, "acc_norm": 0.6041666666666666, "acc_norm_stderr": 0.04089465449325582 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.46, "acc_stderr": 0.05009082659620333, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.54, "acc_stderr": 0.05009082659620333, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620333 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5086705202312138, "acc_stderr": 0.0381189098894041, "acc_norm": 0.5086705202312138, "acc_norm_stderr": 0.0381189098894041 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.35294117647058826, "acc_stderr": 0.04755129616062946, "acc_norm": 0.35294117647058826, "acc_norm_stderr": 0.04755129616062946 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.63, "acc_stderr": 0.04852365870939099, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.40425531914893614, "acc_stderr": 0.03208115750788684, "acc_norm": 0.40425531914893614, "acc_norm_stderr": 0.03208115750788684 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.34210526315789475, "acc_stderr": 0.04462917535336936, "acc_norm": 0.34210526315789475, "acc_norm_stderr": 0.04462917535336936 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4689655172413793, "acc_stderr": 0.04158632762097828, "acc_norm": 0.4689655172413793, "acc_norm_stderr": 0.04158632762097828 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.36243386243386244, "acc_stderr": 0.024757473902752052, "acc_norm": 0.36243386243386244, "acc_norm_stderr": 0.024757473902752052 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.30158730158730157, "acc_stderr": 0.041049472699033945, "acc_norm": 0.30158730158730157, "acc_norm_stderr": 0.041049472699033945 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6451612903225806, "acc_stderr": 0.02721888977330876, "acc_norm": 0.6451612903225806, "acc_norm_stderr": 0.02721888977330876 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.4482758620689655, "acc_stderr": 0.03499113137676744, "acc_norm": 0.4482758620689655, "acc_norm_stderr": 0.03499113137676744 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.59, "acc_stderr": 0.04943110704237102, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.5878787878787879, "acc_stderr": 0.03843566993588717, "acc_norm": 0.5878787878787879, "acc_norm_stderr": 0.03843566993588717 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.6464646464646465, "acc_stderr": 0.03406086723547155, "acc_norm": 0.6464646464646465, "acc_norm_stderr": 0.03406086723547155 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7202072538860104, "acc_stderr": 0.03239637046735704, "acc_norm": 0.7202072538860104, "acc_norm_stderr": 0.03239637046735704 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5, "acc_stderr": 0.02535100632816969, "acc_norm": 0.5, "acc_norm_stderr": 0.02535100632816969 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.32222222222222224, "acc_stderr": 0.028493465091028597, "acc_norm": 0.32222222222222224, "acc_norm_stderr": 0.028493465091028597



