open-llm-leaderboard-old/details_aloobun__d-Qwen1.5-1.8B
数据集概述
数据集简介
该数据集是在对模型 aloobun/d-Qwen1.5-1.8B 进行评估运行期间自动创建的,用于 Open LLM Leaderboard 上的评估。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 数据来源:数据集由 1 次运行创建;每次运行在每个配置中对应一个特定的分割(split),分割名称为该次运行的时间戳。
- 最新结果:"train" 分割始终指向最新的结果。
- 聚合结果:一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示在 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_aloobun__d-Qwen1.5-1.8B", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-03-15T14:07:41.531918 运行的最新结果:
python { "all": { "acc": 0.37527714371350507, "acc_stderr": 0.03371033849354266, "acc_norm": 0.38002997511580094, "acc_norm_stderr": 0.034559861757406905, "mc1": 0.2607099143206854, "mc1_stderr": 0.015368841620766372, "mc2": 0.42886105172291805, "mc2_stderr": 0.014667248068114681 }, "harness|arc:challenge|25": { "acc": 0.2773037542662116, "acc_stderr": 0.013082095839059374, "acc_norm": 0.30887372013651876, "acc_norm_stderr": 0.013501770929344003 }, "harness|hellaswag|10": { "acc": 0.3850826528579964, "acc_stderr": 0.004856203374715461, "acc_norm": 0.49731129257120094, "acc_norm_stderr": 0.004989709267191028 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.3111111111111111, "acc_stderr": 0.03999262876617722, "acc_norm": 0.3111111111111111, "acc_norm_stderr": 0.03999262876617722 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.3092105263157895, "acc_stderr": 0.037610708698674805, "acc_norm": 0.3092105263157895, "acc_norm_stderr": 0.037610708698674805 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.47, "acc_stderr": 0.05016135580465919, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.41509433962264153, "acc_stderr": 0.03032594578928611, "acc_norm": 0.41509433962264153, "acc_norm_stderr": 0.03032594578928611 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.3402777777777778, "acc_stderr": 0.03962135573486219, "acc_norm": 0.3402777777777778, "acc_norm_stderr": 0.03962135573486219 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.22, "acc_stderr": 0.041633319989322695, "acc_norm": 0.22, "acc_norm_stderr": 0.041633319989322695 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.28, "acc_stderr": 0.04512608598542128, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.28, "acc_stderr": 0.04512608598542129, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542129 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.3179190751445087, "acc_stderr": 0.0355068398916558, "acc_norm": 0.3179190751445087, "acc_norm_stderr": 0.0355068398916558 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.21568627450980393, "acc_stderr": 0.04092563958237654, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237654 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956911 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.35319148936170214, "acc_stderr": 0.031245325202761926, "acc_norm": 0.35319148936170214, "acc_norm_stderr": 0.031245325202761926 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.3157894736842105, "acc_stderr": 0.043727482902780064, "acc_norm": 0.3157894736842105, "acc_norm_stderr": 0.043727482902780064 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.3724137931034483, "acc_stderr": 0.04028731532947558, "acc_norm": 0.3724137931034483, "acc_norm_stderr": 0.04028731532947558 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.3253968253968254, "acc_stderr": 0.02413015829976262, "acc_norm": 0.3253968253968254, "acc_norm_stderr": 0.02413015829976262 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.2777777777777778, "acc_stderr": 0.040061680838488774, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.040061680838488774 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.19, "acc_stderr": 0.039427724440366234, "acc_norm": 0.19, "acc_norm_stderr": 0.039427724440366234 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.3967741935483871, "acc_stderr": 0.027831231605767944, "acc_norm": 0.3967741935483871, "acc_norm_stderr": 0.027831231605767944 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.2561576354679803, "acc_stderr": 0.030712730070982592, "acc_norm": 0.2561576354679803, "acc_norm_stderr": 0.030712730070982592 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.4, "acc_stderr": 0.049236596391733084, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.49696969696969695, "acc_stderr": 0.03904272341431857, "acc_norm": 0.49696969696969695, "acc_norm_stderr": 0.03904272341431857 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.42424242424242425, "acc_stderr": 0.03521224908841583, "acc_norm": 0.42424242424242425, "acc_norm_stderr": 0.03521224908841583 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.41450777202072536, "acc_stderr": 0.03555300319557673, "acc_norm": 0.41450777202072536, "acc_norm_stderr": 0.03555300319557673 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.32051282051282054, "acc_stderr": 0.023661296393964273, "acc_norm": 0.32051282051282054, "acc_norm_stderr": 0.023661296393964273 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.25555555555555554, "acc_stderr": 0.026593939101844058, "acc_norm": 0.255



