open-llm-leaderboard-old/details_alexredna__Tukan-1.1B-Chat-reasoning-sft-COLA
数据集概述
数据集摘要
该数据集是在对模型 alexredna/Tukan-1.1B-Chat-reasoning-sft-COLA 进行评估运行期间自动创建的,评估结果发布在 Open LLM Leaderboard 上。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割名称即该次运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_alexredna__Tukan-1.1B-Chat-reasoning-sft-COLA",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-01-22T23:29:11.286981 运行的最新结果:
python { "all": { "acc": 0.2548927738380926, "acc_stderr": 0.03072430220180239, "acc_norm": 0.25609104148058187, "acc_norm_stderr": 0.03148388351888373, "mc1": 0.2386780905752754, "mc1_stderr": 0.014922629695456418, "mc2": 0.3825474897236823, "mc2_stderr": 0.013853773787804245 }, "harness|arc:challenge|25": { "acc": 0.30887372013651876, "acc_stderr": 0.013501770929344004, "acc_norm": 0.3412969283276451, "acc_norm_stderr": 0.01385583128749772 }, "harness|hellaswag|10": { "acc": 0.4479187412865963, "acc_stderr": 0.004962638446395995, "acc_norm": 0.5977892850029874, "acc_norm_stderr": 0.004893418929918262 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.26, "acc_stderr": 0.04408440022768081, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768081 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.17777777777777778, "acc_stderr": 0.033027898599017176, "acc_norm": 0.17777777777777778, "acc_norm_stderr": 0.033027898599017176 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.19078947368421054, "acc_stderr": 0.03197565821032499, "acc_norm": 0.19078947368421054, "acc_norm_stderr": 0.03197565821032499 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768078 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.24150943396226415, "acc_stderr": 0.026341480371118362, "acc_norm": 0.24150943396226415, "acc_norm_stderr": 0.026341480371118362 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.24305555555555555, "acc_stderr": 0.0358687928008034, "acc_norm": 0.24305555555555555, "acc_norm_stderr": 0.0358687928008034 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.24, "acc_stderr": 0.042923469599092816, "acc_norm": 0.24, "acc_norm_stderr": 0.042923469599092816 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.23, "acc_stderr": 0.042295258468165065, "acc_norm": 0.23, "acc_norm_stderr": 0.042295258468165065 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.24277456647398843, "acc_stderr": 0.0326926380614177, "acc_norm": 0.24277456647398843, "acc_norm_stderr": 0.0326926380614177 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.19607843137254902, "acc_stderr": 0.03950581861179961, "acc_norm": 0.19607843137254902, "acc_norm_stderr": 0.03950581861179961 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.26, "acc_stderr": 0.0440844002276808, "acc_norm": 0.26, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.23829787234042554, "acc_stderr": 0.02785125297388979, "acc_norm": 0.23829787234042554, "acc_norm_stderr": 0.02785125297388979 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.22807017543859648, "acc_stderr": 0.03947152782669415, "acc_norm": 0.22807017543859648, "acc_norm_stderr": 0.03947152782669415 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.2482758620689655, "acc_stderr": 0.03600105692727771, "acc_norm": 0.2482758620689655, "acc_norm_stderr": 0.03600105692727771 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.2328042328042328, "acc_stderr": 0.02176596167215453, "acc_norm": 0.2328042328042328, "acc_norm_stderr": 0.02176596167215453 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.21428571428571427, "acc_stderr": 0.03670066451047181, "acc_norm": 0.21428571428571427, "acc_norm_stderr": 0.03670066451047181 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.1935483870967742, "acc_stderr": 0.022475258525536057, "acc_norm": 0.1935483870967742, "acc_norm_stderr": 0.022475258525536057 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.21182266009852216, "acc_stderr": 0.02874898368994106, "acc_norm": 0.21182266009852216, "acc_norm_stderr": 0.02874898368994106 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.24, "acc_stderr": 0.04292346959909284, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909284 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.23636363636363636, "acc_stderr": 0.03317505930009179, "acc_norm": 0.23636363636363636, "acc_norm_stderr": 0.03317505930009179 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.20202020202020202, "acc_stderr": 0.02860620428922988, "acc_norm": 0.20202020202020202, "acc_norm_stderr": 0.02860620428922988 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.24870466321243523, "acc_stderr": 0.03119584087770031, "acc_norm": 0.24870466321243523, "acc_norm_stderr": 0.03119584087770031 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.25384615384615383, "acc_stderr": 0.022066054378726257, "acc_norm": 0.25384615384615383, "acc_norm_stderr": 0.022066054378726257 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.25555555555555554,



