open-llm-leaderboard-old/details_Deathsquad10__TinyLlama-repeat
收藏数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 Deathsquad10/TinyLlama-repeat 时自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。
数据集结构
数据集由 2 次运行创建。每次运行在每个配置中对应一个特定的拆分,拆分名称为该次运行的时间戳。"train" 拆分始终指向最新的结果。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_Deathsquad10__TinyLlama-repeat", "harness_winogrande_5", split="train")
```
最新结果
以下是 最新结果 的摘要:
python { "all": { "acc": 0.2668612836089348, "acc_stderr": 0.031138191401929655, "acc_norm": 0.26805657616203815, "acc_norm_stderr": 0.03189989577745546, "mc1": 0.23378212974296206, "mc1_stderr": 0.01481619599193158, "mc2": 0.387823193354249, "mc2_stderr": 0.01401744646877844 }, "harness|arc:challenge|25": { "acc": 0.34215017064846415, "acc_stderr": 0.01386415215917728, "acc_norm": 0.35238907849829354, "acc_norm_stderr": 0.013960142600598682 }, "harness|hellaswag|10": { "acc": 0.4538936466839275, "acc_stderr": 0.004968521608065469, "acc_norm": 0.6024696275642303, "acc_norm_stderr": 0.004883871774350606 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.23, "acc_stderr": 0.04229525846816505, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816505 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.2222222222222222, "acc_stderr": 0.03591444084196968, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.03591444084196968 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.18421052631578946, "acc_stderr": 0.0315469804508223, "acc_norm": 0.18421052631578946, "acc_norm_stderr": 0.0315469804508223 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.22, "acc_stderr": 0.041633319989322695, "acc_norm": 0.22, "acc_norm_stderr": 0.041633319989322695 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.27547169811320754, "acc_stderr": 0.027495663683724064, "acc_norm": 0.27547169811320754, "acc_norm_stderr": 0.027495663683724064 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2847222222222222, "acc_stderr": 0.03773809990686936, "acc_norm": 0.2847222222222222, "acc_norm_stderr": 0.03773809990686936 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.23699421965317918, "acc_stderr": 0.03242414757483099, "acc_norm": 0.23699421965317918, "acc_norm_stderr": 0.03242414757483099 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.21568627450980393, "acc_stderr": 0.04092563958237656, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237656 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.26, "acc_stderr": 0.044084400227680794, "acc_norm": 0.26, "acc_norm_stderr": 0.044084400227680794 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.2765957446808511, "acc_stderr": 0.02924188386962882, "acc_norm": 0.2765957446808511, "acc_norm_stderr": 0.02924188386962882 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.20175438596491227, "acc_stderr": 0.03775205013583639, "acc_norm": 0.20175438596491227, "acc_norm_stderr": 0.03775205013583639 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.21379310344827587, "acc_stderr": 0.034165204477475494, "acc_norm": 0.21379310344827587, "acc_norm_stderr": 0.034165204477475494 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.26455026455026454, "acc_stderr": 0.022717467897708614, "acc_norm": 0.26455026455026454, "acc_norm_stderr": 0.022717467897708614 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.2857142857142857, "acc_stderr": 0.0404061017820884, "acc_norm": 0.2857142857142857, "acc_norm_stderr": 0.0404061017820884 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.23548387096774193, "acc_stderr": 0.024137632429337707, "acc_norm": 0.23548387096774193, "acc_norm_stderr": 0.024137632429337707 }, "harness|hendrycksTest-high_school_chemistry|5": { 
"acc": 0.24630541871921183, "acc_stderr": 0.03031509928561774, "acc_norm": 0.24630541871921183, "acc_norm_stderr": 0.03031509928561774 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.23, "acc_stderr": 0.04229525846816505, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816505 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.3090909090909091, "acc_stderr": 0.03608541011573967, "acc_norm": 0.3090909090909091, "acc_norm_stderr": 0.03608541011573967 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.2222222222222222, "acc_stderr": 0.029620227874790486, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.029620227874790486 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.25906735751295334, "acc_stderr": 0.03161877917935411, "acc_norm": 0.25906735751295334, "acc_norm_stderr": 0.03161877917935411 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.28205128205128205, "acc_stderr": 0.022815813098896603, "acc_norm": 0.28205128205128205, "acc_norm_stderr": 0.022815813098896603 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.24074074074074073, "acc_stderr": 0.026067159222275805, "acc_norm": 0.24074074074074073, "acc_norm_stderr": 0.026067159222275805 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.23529411764705882, "acc_stderr



