open-llm-leaderboard-old/details_rishiraj__oswald-4x7b
收藏数据集概述
数据集来源
该数据集是在评估模型 rishiraj/oswald-4x7b 在 Open LLM Leaderboard 上的运行过程中自动创建的。
数据集组成
数据集由 63 个配置组成,每个配置对应一个评估任务。数据集是从 1 次运行中创建的,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。"train" 分割始终指向最新的结果。
额外配置
一个额外的配置 "results" 存储了所有运行的聚合结果,用于计算和显示在 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_rishiraj__oswald-4x7b", "harness_winogrande_5", split="train")
最新结果
这些是最新的结果,来自 2024-01-17T02:39:53.848483 的运行: python { "all": { "acc": 0.6475500676457981, "acc_stderr": 0.0320490839945228, "acc_norm": 0.6486532930138588, "acc_norm_stderr": 0.03269576154651724, "mc1": 0.39167686658506734, "mc1_stderr": 0.017087795881769625, "mc2": 0.5738963807470084, "mc2_stderr": 0.0154388424053569 }, "harness|arc:challenge|25": { "acc": 0.6168941979522184, "acc_stderr": 0.014206472661672876, "acc_norm": 0.6578498293515358, "acc_norm_stderr": 0.013864152159177278 }, "harness|hellaswag|10": { "acc": 0.6703843855805617, "acc_stderr": 0.004691128722535485, "acc_norm": 0.8529177454690301, "acc_norm_stderr": 0.0035346403488166734 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.28, "acc_stderr": 0.04512608598542128, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5851851851851851, "acc_stderr": 0.04256193767901409, "acc_norm": 0.5851851851851851, "acc_norm_stderr": 0.04256193767901409 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7105263157894737, "acc_stderr": 0.03690677986137283, "acc_norm": 0.7105263157894737, "acc_norm_stderr": 0.03690677986137283 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.63, "acc_stderr": 0.048523658709391, "acc_norm": 0.63, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7094339622641509, "acc_stderr": 0.02794321998933713, "acc_norm": 0.7094339622641509, "acc_norm_stderr": 0.02794321998933713 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7708333333333334, "acc_stderr": 0.03514697467862388, "acc_norm": 0.7708333333333334, "acc_norm_stderr": 0.03514697467862388 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6127167630057804, "acc_stderr": 0.03714325906302065, "acc_norm": 0.6127167630057804, "acc_norm_stderr": 0.03714325906302065 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.048971049527263666, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.048971049527263666 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.78, "acc_stderr": 0.04163331998932261, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932261 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5702127659574469, "acc_stderr": 0.03236214467715564, "acc_norm": 0.5702127659574469, "acc_norm_stderr": 0.03236214467715564 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5087719298245614, "acc_stderr": 0.047028804320496165, "acc_norm": 0.5087719298245614, "acc_norm_stderr": 0.047028804320496165 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5448275862068965, "acc_stderr": 0.04149886942192117, "acc_norm": 0.5448275862068965, "acc_norm_stderr": 0.04149886942192117 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41798941798941797, "acc_stderr": 0.02540255550326091, "acc_norm": 0.41798941798941797, "acc_norm_stderr": 0.02540255550326091 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4603174603174603, "acc_stderr": 0.04458029125470973, "acc_norm": 0.4603174603174603, "acc_norm_stderr": 0.04458029125470973 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7903225806451613, "acc_stderr": 0.023157879349083522, "acc_norm": 0.7903225806451613, "acc_norm_stderr": 0.023157879349083522 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5073891625615764, "acc_stderr": 0.035176035403610105, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.67, "acc_stderr": 0.04725815626252607, "acc_norm": 0.67, "acc_norm_stderr": 0.04725815626252607 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.793939393939394, "acc_stderr": 0.03158415324047711, "acc_norm": 0.793939393939394, "acc_norm_stderr": 0.03158415324047711 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7777777777777778, "acc_stderr": 0.02962022787479049, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.02962022787479049 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8963730569948186, "acc_stderr": 0.02199531196364424, "acc_norm": 0.8963730569948186, "acc_norm_stderr": 0.02199531196364424 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6512820512820513, "acc_stderr": 0.02416278028401772, "acc_norm": 0.6512820512820513, "acc_norm_stderr": 0.02416278028401772 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.37037037037037035, "acc_stderr": 0.02944316932303154, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.02944316932303154 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6974789915966386, "acc_stderr": 0.029



