open-llm-leaderboard-old/details_YeungNLP__LongQLoRA-Vicuna-13b-8k
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上对模型 YeungNLP/LongQLoRA-Vicuna-13b-8k 进行评估运行时自动创建的。
数据集组成
- 数据集由 63 个配置组成,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_YeungNLP__LongQLoRA-Vicuna-13b-8k", "harness_winogrande_5", split="train")
```
最新结果
这些是最新的结果,来自 2023-12-18T20:09:15.984207 的运行: python { "all": { "acc": 0.5363588804043325, "acc_stderr": 0.03398265746601784, "acc_norm": 0.5419352215266651, "acc_norm_stderr": 0.03471266124009366, "mc1": 0.31946144430844553, "mc1_stderr": 0.016322644182960498, "mc2": 0.4707041581162466, "mc2_stderr": 0.014774260072447868 }, "harness|arc:challenge|25": { "acc": 0.53839590443686, "acc_stderr": 0.01456824555029636, "acc_norm": 0.5639931740614335, "acc_norm_stderr": 0.014491225699230916 }, "harness|hellaswag|10": { "acc": 0.6042620991834295, "acc_stderr": 0.004880092083408043, "acc_norm": 0.8104959171479785, "acc_norm_stderr": 0.0039110756628832725 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.04725815626252605, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252605 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5185185185185185, "acc_stderr": 0.043163785995113245, "acc_norm": 0.5185185185185185, "acc_norm_stderr": 0.043163785995113245 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5921052631578947, "acc_stderr": 0.03999309712777474, "acc_norm": 0.5921052631578947, "acc_norm_stderr": 0.03999309712777474 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.5773584905660377, "acc_stderr": 0.03040233144576954, "acc_norm": 0.5773584905660377, "acc_norm_stderr": 0.03040233144576954 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5486111111111112, "acc_stderr": 0.041614023984032786, "acc_norm": 0.5486111111111112, "acc_norm_stderr": 0.041614023984032786 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.41, "acc_stderr": 0.04943110704237102, "acc_norm": 0.41, "acc_norm_stderr": 0.04943110704237102 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5317919075144508, "acc_stderr": 0.03804749744364764, "acc_norm": 0.5317919075144508, "acc_norm_stderr": 0.03804749744364764 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.29411764705882354, "acc_stderr": 0.04533838195929777, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.04533838195929777 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.64, "acc_stderr": 0.04824181513244218, "acc_norm": 0.64, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.39574468085106385, "acc_stderr": 0.03196758697835363, "acc_norm": 0.39574468085106385, "acc_norm_stderr": 0.03196758697835363 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2894736842105263, "acc_stderr": 0.042663394431593935, "acc_norm": 0.2894736842105263, "acc_norm_stderr": 0.042663394431593935 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.496551724137931, "acc_stderr": 0.041665675771015785, "acc_norm": 0.496551724137931, "acc_norm_stderr": 0.041665675771015785 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.328042328042328, "acc_stderr": 0.024180497164376907, "acc_norm": 0.328042328042328, "acc_norm_stderr": 0.024180497164376907 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.373015873015873, "acc_stderr": 0.04325506042017086, "acc_norm": 0.373015873015873, "acc_norm_stderr": 0.04325506042017086 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6129032258064516, "acc_stderr": 0.027709359675032495, "acc_norm": 0.6129032258064516, "acc_norm_stderr": 0.027709359675032495 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.3645320197044335, "acc_stderr": 0.0338640574606209, "acc_norm": 0.3645320197044335, "acc_norm_stderr": 0.0338640574606209 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.63, "acc_stderr": 0.048523658709391, "acc_norm": 0.63, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6848484848484848, "acc_stderr": 0.0362773057502241, "acc_norm": 0.6848484848484848, "acc_norm_stderr": 0.0362773057502241 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.6818181818181818, "acc_stderr": 0.0331847733384533, "acc_norm": 0.6818181818181818, "acc_norm_stderr": 0.0331847733384533 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7927461139896373, "acc_stderr": 0.02925282329180363, "acc_norm": 0.7927461139896373, "acc_norm_stderr": 0.02925282329180363 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5128205128205128, "acc_stderr": 0.025342671293807264, "acc_norm": 0.5128205128205128, "acc_norm_stderr": 0.025342671293807264 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3037037037037037, "acc_stderr": 0.028037929969114986, "acc_norm": 0.3037037037037037, "acc_norm_stderr": 0.028037929969114986 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.5252100840336



