open-llm-leaderboard-old/details_AetherResearch__Cerebrum-1.0-7b
收藏数据集概述
该数据集是在对模型 AetherResearch/Cerebrum-1.0-7b 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集从 1 次运行中创建,每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_AetherResearch__Cerebrum-1.0-7b", "harness_winogrande_5", split="train")
最新结果
以下是 2024-03-13T18:03:46.221567 运行的最新结果:
python { "all": { "acc": 0.6336455880539492, "acc_stderr": 0.03246442302129431, "acc_norm": 0.638874705874662, "acc_norm_stderr": 0.03311993561874775, "mc1": 0.3182374541003672, "mc1_stderr": 0.01630598864892061, "mc2": 0.464852982885189, "mc2_stderr": 0.01466839649123992 }, "harness|arc:challenge|25": { "acc": 0.5793515358361775, "acc_stderr": 0.0144262112525084, "acc_norm": 0.6160409556313993, "acc_norm_stderr": 0.01421244498065189 }, "harness|hellaswag|10": { "acc": 0.6452897829117705, "acc_stderr": 0.004774476498238617, "acc_norm": 0.8456482772356104, "acc_norm_stderr": 0.0036054721167622893 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6370370370370371, "acc_stderr": 0.04153948404742398, "acc_norm": 0.6370370370370371, "acc_norm_stderr": 0.04153948404742398 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6710526315789473, "acc_stderr": 0.038234289699266046, "acc_norm": 0.6710526315789473, "acc_norm_stderr": 0.038234289699266046 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.690566037735849, "acc_stderr": 0.028450154794118637, "acc_norm": 0.690566037735849, "acc_norm_stderr": 0.028450154794118637 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7013888888888888, "acc_stderr": 0.03827052357950756, "acc_norm": 0.7013888888888888, "acc_norm_stderr": 0.03827052357950756 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.48, "acc_stderr": 0.050211673156867795, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.653179190751445, "acc_stderr": 0.036291466701596636, "acc_norm": 0.653179190751445, "acc_norm_stderr": 0.036291466701596636 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.43137254901960786, "acc_stderr": 0.04928099597287534, "acc_norm": 0.43137254901960786, "acc_norm_stderr": 0.04928099597287534 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5404255319148936, "acc_stderr": 0.032579014820998356, "acc_norm": 0.5404255319148936, "acc_norm_stderr": 0.032579014820998356 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5175438596491229, "acc_stderr": 0.04700708033551038, "acc_norm": 0.5175438596491229, "acc_norm_stderr": 0.04700708033551038 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5862068965517241, "acc_stderr": 0.04104269211806232, "acc_norm": 0.5862068965517241, "acc_norm_stderr": 0.04104269211806232 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41005291005291006, "acc_stderr": 0.025331202438944444, "acc_norm": 0.41005291005291006, "acc_norm_stderr": 0.025331202438944444 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.373015873015873, "acc_stderr": 0.04325506042017087, "acc_norm": 0.373015873015873, "acc_norm_stderr": 0.04325506042017087 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.37, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7612903225806451, "acc_stderr": 0.024251071262208837, "acc_norm": 0.7612903225806451, "acc_norm_stderr": 0.024251071262208837 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5369458128078818, "acc_stderr": 0.035083705204426656, "acc_norm": 0.5369458128078818, "acc_norm_stderr": 0.035083705204426656 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7757575757575758, "acc_stderr": 0.03256866661681102, "acc_norm": 0.7757575757575758, "acc_norm_stderr": 0.03256866661681102 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7575757575757576, "acc_stderr": 0.030532892233932022, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.030532892233932022 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8704663212435233, "acc_stderr": 0.024233532297758733, "acc_norm": 0.8704663212435233, "acc_norm_stderr": 0.024233532297758733 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6435897435897436, "acc_stderr": 0.024283140529467305, "acc_norm": 0.6435897435897436, "acc_norm_stderr": 0.024283140529467305 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3296296296296296, "acc_stderr": 0.028661201116524565, "acc_norm": 0.3296296296296296, "acc_norm_stderr": 0.028661201116524565 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6302521008403361, "acc_stderr": 0.03135709599613591, "acc_norm": 0.6302521008403361, "acc_norm_stderr": 0.03135709599613591 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.3708609271523179, "acc_stderr": 0.03943966699183629, "acc_norm": 0.3708609271523179, "acc_norm_stderr": 0.03943966699183629 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8201834862385321, "acc_stderr": 0.016465345467391545, "acc_norm": 0.8201834862385321, "acc_norm_stderr": 0.016465345467391545 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5185185185185185, "acc_stderr": 0.03407632093854051, "acc_norm": 0.5185185185185185, "acc_norm_stderr": 0.03407632093854051 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.803921568627451, "acc_stderr": 0.027865942286639318, "acc_norm": 0.803921568627451, "acc_norm_stderr": 0.027865942286639318 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.7637130801687764, "acc_stderr": 0.02765215314415927, "acc_norm": 0.7637130801687764, "acc_norm_stderr": 0.02765215314415927 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6905829596412556, "acc_stderr": 0.03102441174057221, "acc_norm": 0.6905829596412556, "acc_norm_stderr": 0.03102441174057221 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7709923664122137, "acc_stderr": 0.036853466317118506, "acc_norm": 0.7709923664122137, "acc_norm_stderr": 0.036853466317118506 }, "harness|hendrycksTest-international_law|5": { "acc": 0.768595041322314, "acc_stderr": 0.03849856098794088, "acc_norm": 0.768595041322314, "acc_norm_stderr": 0.03849856098794088 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7777777777777778, "acc_stderr": 0.040191074725573483, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.040191074725573483 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7914110429447853, "acc_stderr": 0.031921934489347235, "acc_norm": 0.7914110429447853, "acc_norm_stderr": 0.031921934489347235 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.45535714285714285, "acc_stderr": 0.047268355537191, "acc_norm": 0.45535714285714285, "acc_norm_stderr": 0.047268355537191 }, "harness|hendrycksTest-management|5": { "acc": 0.8155339805825242, "acc_stderr": 0.03840423627288276, "acc_norm": 0.8155339805825242, "acc_norm_stderr": 0.03840423627288276 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8589743589743589, "acc_stderr": 0.022801382534597524, "acc_norm": 0.8589743589743589, "acc_norm_stderr": 0.022801382534597524 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.73, "acc_stderr": 0.044619604333847394, "acc_norm": 0.73, "acc_norm_stderr": 0.044619604333847394 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.8186462324393359, "acc_stderr": 0.013778693778464076, "acc_norm": 0.8186462324393359, "acc_norm_stderr": 0.013778693778464076 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7196531791907514, "acc_stderr": 0.02418242749657761, "acc_norm": 0.7196531791907514, "acc_norm_stderr": 0.02418242749657761 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.27932960893854747, "acc_stderr": 0.015005762446786166, "acc_norm": 0.27932960893854747, "acc_norm_stderr": 0.015005762446786166 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7418300653594772, "acc_stderr": 0.025058503316958147, "acc_norm": 0.7418300653594772, "acc_norm_stderr": 0.025058503316958147 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.6977491961414791, "acc_stderr": 0.026082700695399665, "acc_norm": 0.6977491961414791, "acc_norm_stderr": 0.026082700695399665 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7530864197530864, "acc_stderr": 0.023993501709042114, "acc_norm": 0.7530864197530864, "acc_norm_stderr": 0.023993501709042114 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.49645390070921985, "acc_stderr": 0.02982674915328092, "acc_norm": 0.49645390070921985, "acc_norm_stderr": 0.02982674915328092 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.455019556714472, "acc_stderr": 0.01271845661870177, "acc_norm": 0.455019556714472, "acc_norm_stderr": 0.01271845661870177 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6544117647058824, "acc_stderr": 0.028888193103988633, "acc_norm": 0.6544117647058824, "acc_norm_stderr": 0.028888193103988633 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6764705882352942, "acc_stderr": 0.018926082916083383, "acc_norm": 0.6764705882352942, "acc_norm_stderr": 0.018926082916083383 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6727272727272727, "acc_stderr": 0.0449429086625209, "acc_norm": 0.6727272727272727, "acc_norm_stderr": 0.0449429086625209 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7224489795918367, "acc_stderr": 0.02866685779027465, "acc_norm": 0.7224489795918367, "acc_norm_stderr": 0.02866685779027465 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8258706467661692, "acc_stderr": 0.026814951200421603, "acc_norm": 0.8258706467661692, "acc_norm_stderr": 0.026814951200421603 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.84, "acc_stderr": 0.03684529491774711, "acc_norm": 0.84, "acc_norm_stderr": 0.03684529491774711 }, "harness|hendrycksTest-virology|5": { "acc": 0.5481927710843374, "acc_stderr": 0.03874371556587953, "acc_norm": 0.5481927710843374, "acc_norm_stderr": 0.03874371556587953 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8362573099415205, "acc_stderr": 0.028380919596145866, "acc_norm": 0.8362573099415205, "acc_norm_stderr": 0.028380919596145866 }, "harness|truthfulqa:mc|0": { "mc1": 0.3182374541003672, "mc1_stderr": 0.01630598864892061, "mc2": 0.464852982885189, "mc2_stderr": 0.01466839649123992 }, "harness|winogrande|5": { "acc": 0.7940015785319653, "acc_stderr": 0.011366474352008826 }, "harness|gsm8k|5": { "acc": 0.40181956027293403, "acc_stderr": 0.013504357787494035 } }



