open-llm-leaderboard-old/details_JunchengXie__zephyr-7b-beta-gpt-4-80k
收藏数据集概述
数据集简介
该数据集是在评估模型 JunchengXie/zephyr-7b-beta-gpt-4-80k 在 Open LLM Leaderboard 上的运行过程中自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建,每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_JunchengXie__zephyr-7b-beta-gpt-4-80k", "harness_winogrande_5", split="train")
最新结果
以下是 2024-03-27T23:53:43.783934 运行的最新结果:
python { "all": { "acc": 0.602574862684413, "acc_stderr": 0.03340581138036985, "acc_norm": 0.6098452767259759, "acc_norm_stderr": 0.0341057144449039, "mc1": 0.4186046511627907, "mc1_stderr": 0.01727001528447685, "mc2": 0.584043219666108, "mc2_stderr": 0.015523976744619418 }, "harness|arc:challenge|25": { "acc": 0.5802047781569966, "acc_stderr": 0.014422181226303026, "acc_norm": 0.6083617747440273, "acc_norm_stderr": 0.014264122124938213 }, "harness|hellaswag|10": { "acc": 0.5879306910973909, "acc_stderr": 0.004912015369160072, "acc_norm": 0.7907787293367855, "acc_norm_stderr": 0.004059213774735556 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6074074074074074, "acc_stderr": 0.04218506215368881, "acc_norm": 0.6074074074074074, "acc_norm_stderr": 0.04218506215368881 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6381578947368421, "acc_stderr": 0.03910525752849724, "acc_norm": 0.6381578947368421, "acc_norm_stderr": 0.03910525752849724 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6792452830188679, "acc_stderr": 0.028727502957880267, "acc_norm": 0.6792452830188679, "acc_norm_stderr": 0.028727502957880267 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7152777777777778, "acc_stderr": 0.037738099906869334, "acc_norm": 0.7152777777777778, "acc_norm_stderr": 0.037738099906869334 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.46, "acc_stderr": 0.05009082659620333, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.38, "acc_stderr": 0.04878317312145633, "acc_norm": 0.38, "acc_norm_stderr": 0.04878317312145633 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6127167630057804, "acc_stderr": 0.03714325906302065, "acc_norm": 0.6127167630057804, "acc_norm_stderr": 0.03714325906302065 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.43137254901960786, "acc_stderr": 0.04928099597287534, "acc_norm": 0.43137254901960786, "acc_norm_stderr": 0.04928099597287534 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.72, "acc_stderr": 0.04512608598542129, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542129 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5191489361702127, "acc_stderr": 0.03266204299064678, "acc_norm": 0.5191489361702127, "acc_norm_stderr": 0.03266204299064678 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4298245614035088, "acc_stderr": 0.04657047260594962, "acc_norm": 0.4298245614035088, "acc_norm_stderr": 0.04657047260594962 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5448275862068965, "acc_stderr": 0.04149886942192117, "acc_norm": 0.5448275862068965, "acc_norm_stderr": 0.04149886942192117 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.38095238095238093, "acc_stderr": 0.0250107491161376, "acc_norm": 0.38095238095238093, "acc_norm_stderr": 0.0250107491161376 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.373015873015873, "acc_stderr": 0.04325506042017086, "acc_norm": 0.373015873015873, "acc_norm_stderr": 0.04325506042017086 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7677419354838709, "acc_stderr": 0.024022256130308235, "acc_norm": 0.7677419354838709, "acc_norm_stderr": 0.024022256130308235 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.47783251231527096, "acc_stderr": 0.03514528562175008, "acc_norm": 0.47783251231527096, "acc_norm_stderr": 0.03514528562175008 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.64, "acc_stderr": 0.04824181513244218, "acc_norm": 0.64, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7212121212121212, "acc_stderr": 0.03501438706296781, "acc_norm": 0.7212121212121212, "acc_norm_stderr": 0.03501438706296781 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7272727272727273, "acc_stderr": 0.03173071239071724, "acc_norm": 0.7272727272727273, "acc_norm_stderr": 0.03173071239071724 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8341968911917098, "acc_stderr": 0.026839845022314415, "acc_norm": 0.8341968911917098, "acc_norm_stderr": 0.026839845022314415 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6, "acc_stderr": 0.024838811988033165, "acc_norm": 0.6, "acc_norm_stderr": 0.024838811988033165 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.35185185185185186, "acc_stderr": 0.029116617606083004, "acc_norm": 0.35185185185185186, "acc_norm_stderr": 0.02911



