open-llm-leaderboard/details_uukuguy__speechless-llama2-13b
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 uukuguy/speechless-llama2-13b 时自动创建的。数据集包含 64 个配置,每个配置对应一个评估任务。
数据集结构
数据集由 5 次运行的结果创建。每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。"train" 分割始终指向最新的结果。
额外配置
一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_uukuguy__speechless-llama2-13b",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 最新结果 的摘要: python { "all": { "acc": 0.5860683187201721, "acc_stderr": 0.033316954491979946, "acc_norm": 0.5913412721401082, "acc_norm_stderr": 0.0340008049750402, "mc1": 0.3880048959608323, "mc1_stderr": 0.017058761501347972, "mc2": 0.5565985023189125, "mc2_stderr": 0.015435738665954496 }, "harness|arc:challenge|25": { "acc": 0.5784982935153583, "acc_stderr": 0.014430197069326023, "acc_norm": 0.6203071672354948, "acc_norm_stderr": 0.014182119866974872 }, "harness|hellaswag|10": { "acc": 0.6212905795658236, "acc_stderr": 0.004840742206718088, "acc_norm": 0.8181637124078869, "acc_norm_stderr": 0.0038492126228151643 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5481481481481482, "acc_stderr": 0.042992689054808644, "acc_norm": 0.5481481481481482, "acc_norm_stderr": 0.042992689054808644 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5657894736842105, "acc_stderr": 0.0403356566784832, "acc_norm": 0.5657894736842105, "acc_norm_stderr": 0.0403356566784832 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.58, "acc_stderr": 0.04960449637488583, "acc_norm": 0.58, "acc_norm_stderr": 0.04960449637488583 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.630188679245283, "acc_stderr": 0.02971142188010793, "acc_norm": 0.630188679245283, "acc_norm_stderr": 0.02971142188010793 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6458333333333334, "acc_stderr": 0.039994111357535424, "acc_norm": 0.6458333333333334, "acc_norm_stderr": 0.039994111357535424 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.43, "acc_stderr": 0.04975698519562428, "acc_norm": 0.43, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5491329479768786, "acc_stderr": 0.037940126746970296, "acc_norm": 0.5491329479768786, "acc_norm_stderr": 0.037940126746970296 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3235294117647059, "acc_stderr": 0.04655010411319616, "acc_norm": 0.3235294117647059, "acc_norm_stderr": 0.04655010411319616 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.49361702127659574, "acc_stderr": 0.032683358999363366, "acc_norm": 0.49361702127659574, "acc_norm_stderr": 0.032683358999363366 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2807017543859649, "acc_stderr": 0.042270544512322, "acc_norm": 0.2807017543859649, "acc_norm_stderr": 0.042270544512322 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5448275862068965, "acc_stderr": 0.04149886942192118, "acc_norm": 0.5448275862068965, "acc_norm_stderr": 0.04149886942192118 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.37566137566137564, "acc_stderr": 0.024942368931159798, "acc_norm": 0.37566137566137564, "acc_norm_stderr": 0.024942368931159798 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.373015873015873, "acc_stderr": 0.04325506042017087, "acc_norm": 0.373015873015873, "acc_norm_stderr": 0.04325506042017087 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.33, "acc_stderr": 0.04725815626252604, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252604 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6838709677419355, "acc_stderr": 0.026450874489042764, "acc_norm": 0.6838709677419355, "acc_norm_stderr": 0.026450874489042764 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.4975369458128079, "acc_stderr": 0.03517945038691063, "acc_norm": 0.4975369458128079, "acc_norm_stderr": 0.03517945038691063 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.63, "acc_stderr": 0.04852365870939099, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.703030303030303, "acc_stderr": 0.03567969772268049, "acc_norm": 0.703030303030303, "acc_norm_stderr": 0.03567969772268049 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7575757575757576, "acc_stderr": 0.030532892233932026, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.030532892233932026 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8393782383419689, "acc_stderr": 0.02649905770139744, "acc_norm": 0.8393782383419689, "acc_norm_stderr": 0.02649905770139744 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6230769230769231, "acc_stderr": 0.024570975364225995, "acc_norm": 0.6230769230769231, "acc_norm_stderr": 0.024570975364225995 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3148148148148148, "acc_stderr": 0.02831753349606648, "acc_norm": 0.3148148148148148, "acc_norm_stderr": 0.02831753349606648 }, "harness|hendrycksTest-high_school



