open-llm-leaderboard-old/details_dhmeltzer__llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16
数据集概述
数据集摘要
该数据集是在评估模型 dhmeltzer/llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16 在 Open LLM Leaderboard 上的运行过程中自动创建的。
数据集组成
- 数据集包含 61 个配置,每个配置对应一个评估任务。
- 数据集由 2 次运行的结果创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_dhmeltzer__llama-7b-SFT_eli5_wiki65k_1024_r_64_alpha_16",
    "harness_truthfulqa_mc_0",
    split="train",
)
```
最新结果
以下是来自 2023-08-29T21:13:05.130565 运行的最新结果: python { "all": { "acc": 0.4690258680406254, "acc_stderr": 0.03526595221704812, "acc_norm": 0.47314486143663886, "acc_norm_stderr": 0.03525068980912334, "mc1": 0.28151774785801714, "mc1_stderr": 0.01574402724825605, "mc2": 0.43311675077794465, "mc2_stderr": 0.014085064609011494 }, "harness|arc:challenge|25": { "acc": 0.49829351535836175, "acc_stderr": 0.014611305705056995, "acc_norm": 0.5409556313993175, "acc_norm_stderr": 0.014562291073601227 }, "harness|hellaswag|10": { "acc": 0.5910177255526787, "acc_stderr": 0.00490641198447679, "acc_norm": 0.7913762198765186, "acc_norm_stderr": 0.004054944548370489 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4888888888888889, "acc_stderr": 0.04318275491977976, "acc_norm": 0.4888888888888889, "acc_norm_stderr": 0.04318275491977976 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.4276315789473684, "acc_stderr": 0.04026097083296558, "acc_norm": 0.4276315789473684, "acc_norm_stderr": 0.04026097083296558 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.46037735849056605, "acc_stderr": 0.030676096599389184, "acc_norm": 0.46037735849056605, "acc_norm_stderr": 0.030676096599389184 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.4444444444444444, "acc_stderr": 0.04155319955593146, "acc_norm": 0.4444444444444444, "acc_norm_stderr": 0.04155319955593146 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.36, "acc_stderr": 0.048241815132442176, "acc_norm": 0.36, "acc_norm_stderr": 
0.048241815132442176 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.32, "acc_stderr": 0.04688261722621505, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.3988439306358382, "acc_stderr": 0.03733626655383509, "acc_norm": 0.3988439306358382, "acc_norm_stderr": 0.03733626655383509 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.17647058823529413, "acc_stderr": 0.03793281185307809, "acc_norm": 0.17647058823529413, "acc_norm_stderr": 0.03793281185307809 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.41702127659574467, "acc_stderr": 0.032232762667117124, "acc_norm": 0.41702127659574467, "acc_norm_stderr": 0.032232762667117124 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.30701754385964913, "acc_stderr": 0.043391383225798615, "acc_norm": 0.30701754385964913, "acc_norm_stderr": 0.043391383225798615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4206896551724138, "acc_stderr": 0.0411391498118926, "acc_norm": 0.4206896551724138, "acc_norm_stderr": 0.0411391498118926 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.25396825396825395, "acc_stderr": 0.022418042891113946, "acc_norm": 0.25396825396825395, "acc_norm_stderr": 0.022418042891113946 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.36507936507936506, "acc_stderr": 0.04306241259127155, "acc_norm": 0.36507936507936506, "acc_norm_stderr": 0.04306241259127155 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.5096774193548387, "acc_stderr": 0.02843867799890955, "acc_norm": 0.5096774193548387, "acc_norm_stderr": 0.02843867799890955 }, 
"harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.33497536945812806, "acc_stderr": 0.033208527423483104, "acc_norm": 0.33497536945812806, "acc_norm_stderr": 0.033208527423483104 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.45, "acc_stderr": 0.05, "acc_norm": 0.45, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6121212121212121, "acc_stderr": 0.038049136539710114, "acc_norm": 0.6121212121212121, "acc_norm_stderr": 0.038049136539710114 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.48484848484848486, "acc_stderr": 0.0356071651653106, "acc_norm": 0.48484848484848486, "acc_norm_stderr": 0.0356071651653106 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.6683937823834197, "acc_stderr": 0.03397636541089118, "acc_norm": 0.6683937823834197, "acc_norm_stderr": 0.03397636541089118 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.45384615384615384, "acc_stderr": 0.025242770987126177, "acc_norm": 0.45384615384615384, "acc_norm_stderr": 0.025242770987126177 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2814814814814815, "acc_stderr": 0.027420019350945284, "acc_norm": 0.2814814814814815, "acc_norm_stderr": 0.027420019350945284 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.42016806722689076, "acc_stderr": 0.03206183783236152, "acc_norm": 0



