open-llm-leaderboard-old/details_Aryanne__TinyllamaMix-1.1B
数据集概述
数据集摘要
该数据集是在对模型 Aryanne/TinyllamaMix-1.1B 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,该分割以运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
最新结果
以下是 2024-03-04T03:25:32.908373 运行的最新结果:
python { "all": { "acc": 0.2543375578815742, "acc_stderr": 0.030751265952033546, "acc_norm": 0.25554275659800696, "acc_norm_stderr": 0.03152399342244661, "mc1": 0.19951040391676866, "mc1_stderr": 0.013989929967559654, "mc2": 0.33447140360446537, "mc2_stderr": 0.013781406786223522 }, "harness|arc:challenge|25": { "acc": 0.27047781569965873, "acc_stderr": 0.012980954547659554, "acc_norm": 0.3148464163822526, "acc_norm_stderr": 0.013572657703084948 }, "harness|hellaswag|10": { "acc": 0.3703445528779128, "acc_stderr": 0.00481910045686782, "acc_norm": 0.4838677554272057, "acc_norm_stderr": 0.00498718356079276 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.24444444444444444, "acc_stderr": 0.03712537833614866, "acc_norm": 0.24444444444444444, "acc_norm_stderr": 0.03712537833614866 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.19078947368421054, "acc_stderr": 0.031975658210325, "acc_norm": 0.19078947368421054, "acc_norm_stderr": 0.031975658210325 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.24, "acc_stderr": 0.04292346959909281, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909281 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.21509433962264152, "acc_stderr": 0.025288394502891363, "acc_norm": 0.21509433962264152, "acc_norm_stderr": 0.025288394502891363 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.18055555555555555, "acc_stderr": 0.032166008088022675, "acc_norm": 0.18055555555555555, "acc_norm_stderr": 0.032166008088022675 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.26, "acc_stderr": 0.044084400227680794, "acc_norm": 0.26, "acc_norm_stderr": 0.044084400227680794 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.16, "acc_stderr": 0.0368452949177471, "acc_norm": 0.16, "acc_norm_stderr": 0.0368452949177471 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.23699421965317918, "acc_stderr": 0.03242414757483099, "acc_norm": 0.23699421965317918, "acc_norm_stderr": 0.03242414757483099 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.22549019607843138, "acc_stderr": 0.041583075330832865, "acc_norm": 0.22549019607843138, "acc_norm_stderr": 0.041583075330832865 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.28, "acc_stderr": 0.04512608598542129, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542129 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.2978723404255319, "acc_stderr": 0.029896145682095462, "acc_norm": 0.2978723404255319, "acc_norm_stderr": 0.029896145682095462 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2631578947368421, "acc_stderr": 0.04142439719489362, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.04142439719489362 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.23448275862068965, "acc_stderr": 0.035306258743465914, "acc_norm": 0.23448275862068965, "acc_norm_stderr": 0.035306258743465914 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.24603174603174602, "acc_stderr": 0.022182037202948368, "acc_norm": 0.24603174603174602, "acc_norm_stderr": 0.022182037202948368 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.1984126984126984, "acc_stderr": 0.03567016675276864, "acc_norm": 0.1984126984126984, "acc_norm_stderr": 0.03567016675276864 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.2032258064516129, "acc_stderr": 0.02289168798455496, "acc_norm": 0.2032258064516129, "acc_norm_stderr": 0.02289168798455496 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.2857142857142857, "acc_stderr": 0.03178529710642748, "acc_norm": 0.2857142857142857, "acc_norm_stderr": 0.03178529710642748 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.20606060606060606, "acc_stderr": 0.031584153240477086, "acc_norm": 0.20606060606060606, "acc_norm_stderr": 0.031584153240477086 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.18686868686868688, "acc_stderr": 0.027772533334218967, "acc_norm": 0.18686868686868688, "acc_norm_stderr": 0.027772533334218967 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.23834196891191708, "acc_stderr": 0.030748905363909868, "acc_norm": 0.23834196891191708, "acc_norm_stderr": 0.030748905363909868 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.2128205128205128, "acc_stderr": 0.020752423722128037, "acc_norm": 0.2128205128205128, "acc_norm_stderr": 0.020752423722128037 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2962962962962963, "acc_stderr": 0.027840811495871937, "acc_norm": 0.2962962962962963, "acc_norm_stderr": 0.027840811495871937 }, "harness|hendrycksTest-high_school_microeconomics|5": {



