open-llm-leaderboard-old/details_liminerity__Blur-7b-v1.2
收藏数据集概述
数据集描述
该数据集是在对模型 liminerity/Blur-7b-v1.2 进行评估运行期间自动创建的,评估运行在 Open LLM Leaderboard 上进行。
数据集组成
- 该数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集从 1 次运行中创建,每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_liminerity__Blur-7b-v1.2", "harness_winogrande_5", split="train")
最新结果
这些是最新的结果,来自 2024-01-18T13:00:27.961191 的运行: python { "all": { "acc": 0.6357129950975389, "acc_stderr": 0.03262066192131251, "acc_norm": 0.6382762311799055, "acc_norm_stderr": 0.03328259277014658, "mc1": 0.43451652386780903, "mc1_stderr": 0.017352738749259564, "mc2": 0.6030326315591199, "mc2_stderr": 0.015260409379504259 }, "harness|arc:challenge|25": { "acc": 0.6254266211604096, "acc_stderr": 0.01414419347189345, "acc_norm": 0.6535836177474402, "acc_norm_stderr": 0.013905011180063223 }, "harness|hellaswag|10": { "acc": 0.6528579964150567, "acc_stderr": 0.00475088440109516, "acc_norm": 0.8387771360286795, "acc_norm_stderr": 0.0036698484004877773 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5851851851851851, "acc_stderr": 0.04256193767901408, "acc_norm": 0.5851851851851851, "acc_norm_stderr": 0.04256193767901408 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6907894736842105, "acc_stderr": 0.037610708698674805, "acc_norm": 0.6907894736842105, "acc_norm_stderr": 0.037610708698674805 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7056603773584905, "acc_stderr": 0.02804918631569525, "acc_norm": 0.7056603773584905, "acc_norm_stderr": 0.02804918631569525 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7013888888888888, "acc_stderr": 0.03827052357950756, "acc_norm": 0.7013888888888888, "acc_norm_stderr": 0.03827052357950756 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.47, "acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.53, "acc_stderr": 0.050161355804659205, "acc_norm": 0.53, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6184971098265896, "acc_stderr": 0.03703851193099521, "acc_norm": 0.6184971098265896, "acc_norm_stderr": 0.03703851193099521 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.43137254901960786, "acc_stderr": 0.04928099597287534, "acc_norm": 0.43137254901960786, "acc_norm_stderr": 0.04928099597287534 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.74, "acc_stderr": 0.044084400227680794, "acc_norm": 0.74, "acc_norm_stderr": 0.044084400227680794 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5702127659574469, "acc_stderr": 0.03236214467715564, "acc_norm": 0.5702127659574469, "acc_norm_stderr": 0.03236214467715564 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.45614035087719296, "acc_stderr": 0.046854730419077895, "acc_norm": 0.45614035087719296, "acc_norm_stderr": 0.046854730419077895 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.593103448275862, "acc_stderr": 0.04093793981266236, "acc_norm": 0.593103448275862, "acc_norm_stderr": 0.04093793981266236 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41534391534391535, "acc_stderr": 0.025379524910778405, "acc_norm": 0.41534391534391535, "acc_norm_stderr": 0.025379524910778405 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.40476190476190477, "acc_stderr": 0.04390259265377562, "acc_norm": 0.40476190476190477, "acc_norm_stderr": 0.04390259265377562 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7612903225806451, "acc_stderr": 0.02425107126220884, "acc_norm": 0.7612903225806451, "acc_norm_stderr": 0.02425107126220884 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.541871921182266, "acc_stderr": 0.03505630140785741, "acc_norm": 0.541871921182266, "acc_norm_stderr": 0.03505630140785741 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.68, "acc_stderr": 0.04688261722621505, "acc_norm": 0.68, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7757575757575758, "acc_stderr": 0.03256866661681102, "acc_norm": 0.7757575757575758, "acc_norm_stderr": 0.03256866661681102 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.803030303030303, "acc_stderr": 0.02833560973246336, "acc_norm": 0.803030303030303, "acc_norm_stderr": 0.02833560973246336 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8704663212435233, "acc_stderr": 0.024233532297758723, "acc_norm": 0.8704663212435233, "acc_norm_stderr": 0.024233532297758723 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6512820512820513, "acc_stderr": 0.02416278028401772, "acc_norm": 0.6512820512820513, "acc_norm_stderr": 0.02416278028401772 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3592592592592593, "acc_stderr": 0.029252905927251972, "acc_norm": 0.3592592592592593, "acc_norm_stderr": 0.029252905927251972 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.693277310



