open-llm-leaderboard/details_dhmeltzer__Llama-2-7b-hf-wiki30k_r_64_alpha_16
数据集概述
该数据集是在评估模型 dhmeltzer/Llama-2-7b-hf-wiki30k_r_64_alpha_16 的过程中自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 61 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建;每次运行在每个配置中对应一个特定的分割(split),分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
最新结果
以下是运行 2023-08-29T19:45:42.675668 的最新结果:
python { "all": { "acc": 0.4674646723979816, "acc_stderr": 0.03520803561024559, "acc_norm": 0.47144963624975206, "acc_norm_stderr": 0.03519372000845246, "mc1": 0.24479804161566707, "mc1_stderr": 0.015051869486715014, "mc2": 0.38637509679052146, "mc2_stderr": 0.013509815622124081 }, "harness|arc:challenge|25": { "acc": 0.4948805460750853, "acc_stderr": 0.01461062489030916, "acc_norm": 0.5324232081911263, "acc_norm_stderr": 0.014580637569995421 }, "harness|hellaswag|10": { "acc": 0.5877315275841466, "acc_stderr": 0.004912370023913015, "acc_norm": 0.7853017327225652, "acc_norm_stderr": 0.004097736838432052 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4666666666666667, "acc_stderr": 0.043097329010363554, "acc_norm": 0.4666666666666667, "acc_norm_stderr": 0.043097329010363554 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.39473684210526316, "acc_stderr": 0.039777499346220734, "acc_norm": 0.39473684210526316, "acc_norm_stderr": 0.039777499346220734 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.4528301886792453, "acc_stderr": 0.03063562795796182, "acc_norm": 0.4528301886792453, "acc_norm_stderr": 0.03063562795796182 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.4513888888888889, "acc_stderr": 0.04161402398403279, "acc_norm": 0.4513888888888889, "acc_norm_stderr": 0.04161402398403279 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.33, "acc_stderr": 0.04725815626252605, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252605 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.41040462427745666, "acc_stderr": 0.03750757044895537, "acc_norm": 0.41040462427745666, "acc_norm_stderr": 0.03750757044895537 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.21568627450980393, "acc_stderr": 0.04092563958237654, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237654 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.41702127659574467, "acc_stderr": 0.032232762667117124, "acc_norm": 0.41702127659574467, "acc_norm_stderr": 0.032232762667117124 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2894736842105263, "acc_stderr": 0.04266339443159393, "acc_norm": 0.2894736842105263, "acc_norm_stderr": 0.04266339443159393 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.47586206896551725, "acc_stderr": 0.041618085035015295, "acc_norm": 0.47586206896551725, "acc_norm_stderr": 0.041618085035015295 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.26455026455026454, "acc_stderr": 0.022717467897708628, "acc_norm": 0.26455026455026454, "acc_norm_stderr": 0.022717467897708628 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.31746031746031744, "acc_stderr": 0.04163453031302859, "acc_norm": 0.31746031746031744, "acc_norm_stderr": 0.04163453031302859 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.32, "acc_stderr": 0.04688261722621503, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621503 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.4967741935483871, "acc_stderr": 0.02844341422643833, "acc_norm": 0.4967741935483871, "acc_norm_stderr": 0.02844341422643833 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.3497536945812808, "acc_stderr": 0.03355400904969566, "acc_norm": 0.3497536945812808, "acc_norm_stderr": 0.03355400904969566 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.4, "acc_stderr": 0.049236596391733084, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.593939393939394, "acc_stderr": 0.03834816355401181, "acc_norm": 0.593939393939394, "acc_norm_stderr": 0.03834816355401181 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.4898989898989899, "acc_stderr": 0.03561625488673745, "acc_norm": 0.4898989898989899, "acc_norm_stderr": 0.03561625488673745 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7150259067357513, "acc_stderr": 0.0325771407770966, "acc_norm": 0.7150259067357513, "acc_norm_stderr": 0.0325771407770966 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.43846153846153846, "acc_stderr": 0.025158266016868564, "acc_norm": 0.43846153846153846, "acc_norm_stderr": 0.025158266016868564 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2814814814814815, "acc_stderr": 0.027420019350945277, "acc_norm": 0.2814814814814815, "acc_norm_stderr": 0.027420019350945277 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.432773



