open-llm-leaderboard/details_CHIH-HUNG__llama-2-13b-open_orca_20w
数据集概述
数据集摘要
该数据集是在对模型 CHIH-HUNG/llama-2-13b-open_orca_20w 进行评估运行时自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 61 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_CHIH-HUNG__llama-2-13b-open_orca_20w", "harness_truthfulqa_mc_0", split="train")
```
最新结果
这些是最新的结果,来自 2023-08-30T09:16:35.193244 的运行: python { "all": { "acc": 0.5638417257556474, "acc_stderr": 0.03429434698845683, "acc_norm": 0.5680270426638301, "acc_norm_stderr": 0.03427324810755194, "mc1": 0.3011015911872705, "mc1_stderr": 0.016058999026100612, "mc2": 0.4313573698064727, "mc2_stderr": 0.014673057614679777 }, "harness|arc:challenge|25": { "acc": 0.5588737201365188, "acc_stderr": 0.014509747749064664, "acc_norm": 0.5989761092150171, "acc_norm_stderr": 0.014322255790719869 }, "harness|hellaswag|10": { "acc": 0.6183031268671579, "acc_stderr": 0.004848099661619698, "acc_norm": 0.8251344353714399, "acc_norm_stderr": 0.0037907576465759014 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.047258156262526066, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526066 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4666666666666667, "acc_stderr": 0.043097329010363554, "acc_norm": 0.4666666666666667, "acc_norm_stderr": 0.043097329010363554 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5723684210526315, "acc_stderr": 0.04026097083296564, "acc_norm": 0.5723684210526315, "acc_norm_stderr": 0.04026097083296564 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.54, "acc_stderr": 0.05009082659620332, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6377358490566037, "acc_stderr": 0.029582245128384303, "acc_norm": 0.6377358490566037, "acc_norm_stderr": 0.029582245128384303 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5694444444444444, "acc_stderr": 0.04140685639111502, "acc_norm": 0.5694444444444444, "acc_norm_stderr": 0.04140685639111502 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.45, "acc_stderr": 0.049999999999999996, "acc_norm": 0.45, "acc_norm_stderr": 
0.049999999999999996 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5144508670520231, "acc_stderr": 0.03810871630454764, "acc_norm": 0.5144508670520231, "acc_norm_stderr": 0.03810871630454764 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.28431372549019607, "acc_stderr": 0.04488482852329017, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.04488482852329017 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.74, "acc_stderr": 0.04408440022768078, "acc_norm": 0.74, "acc_norm_stderr": 0.04408440022768078 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.4425531914893617, "acc_stderr": 0.03246956919789958, "acc_norm": 0.4425531914893617, "acc_norm_stderr": 0.03246956919789958 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.21929824561403508, "acc_stderr": 0.03892431106518754, "acc_norm": 0.21929824561403508, "acc_norm_stderr": 0.03892431106518754 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5310344827586206, "acc_stderr": 0.04158632762097828, "acc_norm": 0.5310344827586206, "acc_norm_stderr": 0.04158632762097828 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.3492063492063492, "acc_stderr": 0.024552292209342665, "acc_norm": 0.3492063492063492, "acc_norm_stderr": 0.024552292209342665 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4126984126984127, "acc_stderr": 0.04403438954768176, "acc_norm": 0.4126984126984127, "acc_norm_stderr": 0.04403438954768176 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.37, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6516129032258065, "acc_stderr": 0.027104826328100944, "acc_norm": 0.6516129032258065, "acc_norm_stderr": 0.027104826328100944 }, 
"harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.46798029556650245, "acc_stderr": 0.035107665979592154, "acc_norm": 0.46798029556650245, "acc_norm_stderr": 0.035107665979592154 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6787878787878788, "acc_stderr": 0.036462049632538115, "acc_norm": 0.6787878787878788, "acc_norm_stderr": 0.036462049632538115 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7121212121212122, "acc_stderr": 0.03225883512300992, "acc_norm": 0.7121212121212122, "acc_norm_stderr": 0.03225883512300992 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8341968911917098, "acc_stderr": 0.026839845022314415, "acc_norm": 0.8341968911917098, "acc_norm_stderr": 0.026839845022314415 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5025641025641026, "acc_stderr": 0.025350672979412195, "acc_norm": 0.5025641025641026, "acc_norm_stderr": 0.025350672979412195 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2777777777777778, "acc_stderr": 0.027309140588230182, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.027309140588230182 }, "harness|



