open-llm-leaderboard-old/details_jan-hq__LlamaCorn-1.1B-Chat
数据集概述
数据集简介
该数据集是在评估模型jan-hq/LlamaCorn-1.1B-Chat在Open LLM Leaderboard上的运行过程中自动创建的。
数据集结构
- 数据集包含63个配置,每个配置对应一个评估任务。
- 数据集由1次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train"分割始终指向最新的结果。
- 一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示在Open LLM Leaderboard上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_jan-hq__LlamaCorn-1.1B-Chat",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是2024-03-12T10:29:25.854017运行的最新结果: python { "all": { "acc": 0.29373569196097277, "acc_stderr": 0.032243803435004895, "acc_norm": 0.2960484401036193, "acc_norm_stderr": 0.03310756115855655, "mc1": 0.23378212974296206, "mc1_stderr": 0.014816195991931586, "mc2": 0.36855840909843307, "mc2_stderr": 0.013989365630749612 }, "harness|arc:challenge|25": { "acc": 0.318259385665529, "acc_stderr": 0.013611993916971451, "acc_norm": 0.3378839590443686, "acc_norm_stderr": 0.013822047922283516 }, "harness|hellaswag|10": { "acc": 0.4482174865564629, "acc_stderr": 0.0049629497842360445, "acc_norm": 0.5924118701453893, "acc_norm_stderr": 0.004903815885983271 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.28, "acc_stderr": 0.04512608598542128, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.23703703703703705, "acc_stderr": 0.03673731683969506, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.03673731683969506 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.24342105263157895, "acc_stderr": 0.034923496688842384, "acc_norm": 0.24342105263157895, "acc_norm_stderr": 0.034923496688842384 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2943396226415094, "acc_stderr": 0.028049186315695245, "acc_norm": 0.2943396226415094, "acc_norm_stderr": 0.028049186315695245 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.25, "acc_stderr": 0.03621034121889507, "acc_norm": 0.25, "acc_norm_stderr": 0.03621034121889507 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768078 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.36, "acc_stderr": 0.048241815132442176, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.23699421965317918, "acc_stderr": 0.03242414757483098, "acc_norm": 0.23699421965317918, "acc_norm_stderr": 0.03242414757483098 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.24509803921568626, "acc_stderr": 0.042801058373643966, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.042801058373643966 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.32340425531914896, "acc_stderr": 0.030579442773610337, "acc_norm": 0.32340425531914896, "acc_norm_stderr": 0.030579442773610337 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2719298245614035, "acc_stderr": 0.04185774424022056, "acc_norm": 0.2719298245614035, "acc_norm_stderr": 0.04185774424022056 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.296551724137931, "acc_stderr": 0.03806142687309994, "acc_norm": 0.296551724137931, "acc_norm_stderr": 0.03806142687309994 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.2830687830687831, "acc_stderr": 0.023201392938194978, "acc_norm": 0.2830687830687831, "acc_norm_stderr": 0.023201392938194978 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.24603174603174602, "acc_stderr": 0.03852273364924316, "acc_norm": 0.24603174603174602, "acc_norm_stderr": 0.03852273364924316 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.24, "acc_stderr": 0.04292346959909283, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909283 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.24193548387096775, "acc_stderr": 0.024362599693031083, "acc_norm": 0.24193548387096775, "acc_norm_stderr": 0.024362599693031083 }, "harness|hendrycksTest-high_school_chemistry|5": { 
"acc": 0.23645320197044334, "acc_stderr": 0.029896114291733545, "acc_norm": 0.23645320197044334, "acc_norm_stderr": 0.029896114291733545 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.28, "acc_stderr": 0.04512608598542129, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542129 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.3515151515151515, "acc_stderr": 0.0372820699868265, "acc_norm": 0.3515151515151515, "acc_norm_stderr": 0.0372820699868265 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.2474747474747475, "acc_stderr": 0.03074630074212451, "acc_norm": 0.2474747474747475, "acc_norm_stderr": 0.03074630074212451 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.27461139896373055, "acc_stderr": 0.03221024508041153, "acc_norm": 0.27461139896373055, "acc_norm_stderr": 0.03221024508041153 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.26666666666666666, "acc_stderr": 0.022421273612923714, "acc_norm": 0.26666666666666666, "acc_norm_stderr": 0.022421273612923714 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.23703703703703705, "acc_stderr": 0.025928876132766107, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.02592887613



