open-llm-leaderboard-old/details_gmonsoon__Qwenchana-0.5B-restart
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上对模型 gmonsoon/Qwenchana-0.5B-restart 进行评估运行期间自动创建的。
数据集组成
- 该数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 2 次运行创建。每次运行在每个配置中对应一个特定的拆分，拆分以该次运行的时间戳命名。
- "train" 拆分始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_gmonsoon__Qwenchana-0.5B-restart",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-03-03T08:24:22.530704 运行的最新结果:
python { "all": { "acc": 0.25895229357807475, "acc_stderr": 0.03102625874189923, "acc_norm": 0.2602863804038217, "acc_norm_stderr": 0.03178781024016605, "mc1": 0.24969400244798043, "mc1_stderr": 0.015152286907148128, "mc2": 0.404780510761619, "mc2_stderr": 0.014503353767789265 }, "harness|arc:challenge|25": { "acc": 0.2627986348122867, "acc_stderr": 0.012862523175351333, "acc_norm": 0.3003412969283277, "acc_norm_stderr": 0.01339590930995701 }, "harness|hellaswag|10": { "acc": 0.3679545907189803, "acc_stderr": 0.004812633280078256, "acc_norm": 0.45947022505477, "acc_norm_stderr": 0.004973361339169648 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.34074074074074073, "acc_stderr": 0.04094376269996794, "acc_norm": 0.34074074074074073, "acc_norm_stderr": 0.04094376269996794 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.3092105263157895, "acc_stderr": 0.03761070869867479, "acc_norm": 0.3092105263157895, "acc_norm_stderr": 0.03761070869867479 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.29, "acc_stderr": 0.04560480215720684, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2641509433962264, "acc_stderr": 0.02713429162874171, "acc_norm": 0.2641509433962264, "acc_norm_stderr": 0.02713429162874171 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2986111111111111, "acc_stderr": 0.03827052357950756, "acc_norm": 0.2986111111111111, "acc_norm_stderr": 0.03827052357950756 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.21, "acc_stderr": 0.04093601807403326, "acc_norm": 0.21, "acc_norm_stderr": 0.04093601807403326 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.24, "acc_stderr": 0.04292346959909283, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909283 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.2138728323699422, "acc_stderr": 0.031265112061730424, "acc_norm": 0.2138728323699422, "acc_norm_stderr": 0.031265112061730424 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.22549019607843138, "acc_stderr": 0.041583075330832865, "acc_norm": 0.22549019607843138, "acc_norm_stderr": 0.041583075330832865 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.25957446808510637, "acc_stderr": 0.028659179374292323, "acc_norm": 0.25957446808510637, "acc_norm_stderr": 0.028659179374292323 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.22807017543859648, "acc_stderr": 0.03947152782669415, "acc_norm": 0.22807017543859648, "acc_norm_stderr": 0.03947152782669415 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.2896551724137931, "acc_stderr": 0.03780019230438014, "acc_norm": 0.2896551724137931, "acc_norm_stderr": 0.03780019230438014 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.26455026455026454, "acc_stderr": 0.022717467897708617, "acc_norm": 0.26455026455026454, "acc_norm_stderr": 0.022717467897708617 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.15079365079365079, "acc_stderr": 0.03200686497287392, "acc_norm": 0.15079365079365079, "acc_norm_stderr": 0.03200686497287392 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.18064516129032257, "acc_stderr": 0.021886178567172534, "acc_norm": 0.18064516129032257, "acc_norm_stderr": 0.021886178567172534 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.2512315270935961, "acc_stderr": 
0.030516530732694433, "acc_norm": 0.2512315270935961, "acc_norm_stderr": 0.030516530732694433 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.24, "acc_stderr": 0.04292346959909282, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.21212121212121213, "acc_stderr": 0.031922715695483, "acc_norm": 0.21212121212121213, "acc_norm_stderr": 0.031922715695483 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.18181818181818182, "acc_stderr": 0.027479603010538797, "acc_norm": 0.18181818181818182, "acc_norm_stderr": 0.027479603010538797 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.19689119170984457, "acc_stderr": 0.028697873971860664, "acc_norm": 0.19689119170984457, "acc_norm_stderr": 0.028697873971860664 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.21794871794871795, "acc_stderr": 0.020932445774463182, "acc_norm": 0.21794871794871795, "acc_norm_stderr": 0.020932445774463182 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.28888888888888886, "acc_stderr": 0.027634907264178544, "acc_



