open-llm-leaderboard-old/details_senseable__Wilbur-30B
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 senseable/Wilbur-30B 的运行过程中自动创建的。
数据集结构
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割名称为该次运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_senseable__Wilbur-30B", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-01-27T07:45:34.703302 运行的最新结果:
python { "all": { "acc": 0.7650338898352297, "acc_stderr": 0.028248683874528373, "acc_norm": 0.7682008360158653, "acc_norm_stderr": 0.028793309090233483, "mc1": 0.5263157894736842, "mc1_stderr": 0.017479241161975457, "mc2": 0.6996159108788989, "mc2_stderr": 0.014237498534320117 }, "harness|arc:challenge|25": { "acc": 0.7218430034129693, "acc_stderr": 0.0130944699195388, "acc_norm": 0.7406143344709898, "acc_norm_stderr": 0.012808273573927094 }, "harness|hellaswag|10": { "acc": 0.6719776936865166, "acc_stderr": 0.004685334844038661, "acc_norm": 0.866759609639514, "acc_norm_stderr": 0.003391398293613441 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.7481481481481481, "acc_stderr": 0.03749850709174021, "acc_norm": 0.7481481481481481, "acc_norm_stderr": 0.03749850709174021 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.881578947368421, "acc_stderr": 0.026293995855474945, "acc_norm": 0.881578947368421, "acc_norm_stderr": 0.026293995855474945 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.77, "acc_stderr": 0.04229525846816505, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816505 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.8113207547169812, "acc_stderr": 0.024079995130062253, "acc_norm": 0.8113207547169812, "acc_norm_stderr": 0.024079995130062253 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.9097222222222222, "acc_stderr": 0.023964965777906935, "acc_norm": 0.9097222222222222, "acc_norm_stderr": 0.023964965777906935 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.53, "acc_stderr": 0.05016135580465919, "acc_norm": 0.53, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.59, "acc_stderr": 0.04943110704237101, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237101 }, "harness|hendrycksTest-college_mathematics|5": 
{ "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.7225433526011561, "acc_stderr": 0.03414014007044036, "acc_norm": 0.7225433526011561, "acc_norm_stderr": 0.03414014007044036 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.5392156862745098, "acc_stderr": 0.04959859966384181, "acc_norm": 0.5392156862745098, "acc_norm_stderr": 0.04959859966384181 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.79, "acc_stderr": 0.04093601807403326, "acc_norm": 0.79, "acc_norm_stderr": 0.04093601807403326 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.7617021276595745, "acc_stderr": 0.027851252973889774, "acc_norm": 0.7617021276595745, "acc_norm_stderr": 0.027851252973889774 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5701754385964912, "acc_stderr": 0.04657047260594964, "acc_norm": 0.5701754385964912, "acc_norm_stderr": 0.04657047260594964 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.7586206896551724, "acc_stderr": 0.03565998174135302, "acc_norm": 0.7586206896551724, "acc_norm_stderr": 0.03565998174135302 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.7407407407407407, "acc_stderr": 0.022569897074918424, "acc_norm": 0.7407407407407407, "acc_norm_stderr": 0.022569897074918424 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5158730158730159, "acc_stderr": 0.044698818540726076, "acc_norm": 0.5158730158730159, "acc_norm_stderr": 0.044698818540726076 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.62, "acc_stderr": 0.04878317312145632, "acc_norm": 0.62, "acc_norm_stderr": 0.04878317312145632 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.9032258064516129, "acc_stderr": 0.016818943416345197, "acc_norm": 0.9032258064516129, "acc_norm_stderr": 0.016818943416345197 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.6847290640394089, "acc_stderr": 0.03269080871970186, 
"acc_norm": 0.6847290640394089, "acc_norm_stderr": 0.03269080871970186 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.77, "acc_stderr": 0.042295258468165044, "acc_norm": 0.77, "acc_norm_stderr": 0.042295258468165044 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8727272727272727, "acc_stderr": 0.026024657651656187, "acc_norm": 0.8727272727272727, "acc_norm_stderr": 0.026024657651656187 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.9292929292929293, "acc_stderr": 0.018263105420199505, "acc_norm": 0.9292929292929293, "acc_norm_stderr": 0.018263105420199505 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9740932642487047, "acc_stderr": 0.011464523356953162, "acc_norm": 0.9740932642487047, "acc_norm_stderr": 0.011464523356953162 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.8102564102564103, "acc_stderr": 0.0198801654065888, "acc_norm": 0.8102564102564103, "acc_norm_stderr": 0.0198801654065888 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.45925925925925926, "acc_stderr": 0.03038416923235083, "acc_norm": 0.45925925925925926, "acc_norm_stderr": 0.03038416923235



