open-llm-leaderboard-old/details_yeontaek__llama-2-70B-ensemble-v8
数据集概述
该数据集是在对模型 yeontaek/llama-2-70B-ensemble-v8 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集由 61 个配置组成,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的拆分,拆分以该次运行的时间戳命名。
- "train" 拆分始终指向最新的结果。
- 额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_yeontaek__llama-2-70B-ensemble-v8", "harness_truthfulqa_mc_0", split="train")
```
最新结果
以下是 2023-09-04T20:27:12.407104 运行的最新结果:
python { "all": { "acc": 0.6363532267919642, "acc_stderr": 0.03285197203583459, "acc_norm": 0.6397352881146252, "acc_norm_stderr": 0.03283029655087548, "mc1": 0.45165238678090575, "mc1_stderr": 0.017421480300277643, "mc2": 0.6211306316728467, "mc2_stderr": 0.01529356194952766 }, "harness|arc:challenge|25": { "acc": 0.6561433447098977, "acc_stderr": 0.013880644570156215, "acc_norm": 0.6723549488054608, "acc_norm_stderr": 0.013715847940719339 }, "harness|hellaswag|10": { "acc": 0.6623182632941645, "acc_stderr": 0.004719529099913132, "acc_norm": 0.8456482772356104, "acc_norm_stderr": 0.003605472116762285 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5333333333333333, "acc_stderr": 0.043097329010363554, "acc_norm": 0.5333333333333333, "acc_norm_stderr": 0.043097329010363554 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6842105263157895, "acc_stderr": 0.0378272898086547, "acc_norm": 0.6842105263157895, "acc_norm_stderr": 0.0378272898086547 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.68, "acc_stderr": 0.046882617226215034, "acc_norm": 0.68, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7056603773584905, "acc_stderr": 0.02804918631569525, "acc_norm": 0.7056603773584905, "acc_norm_stderr": 0.02804918631569525 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.75, "acc_stderr": 0.03621034121889507, "acc_norm": 0.75, "acc_norm_stderr": 0.03621034121889507 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.43, "acc_stderr": 0.04975698519562428, "acc_norm": 0.43, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.48, "acc_stderr": 0.050211673156867795, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.38, 
"acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6069364161849711, "acc_stderr": 0.03724249595817731, "acc_norm": 0.6069364161849711, "acc_norm_stderr": 0.03724249595817731 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.29411764705882354, "acc_stderr": 0.04533838195929777, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.04533838195929777 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.72, "acc_stderr": 0.045126085985421276, "acc_norm": 0.72, "acc_norm_stderr": 0.045126085985421276 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6297872340425532, "acc_stderr": 0.03156564682236784, "acc_norm": 0.6297872340425532, "acc_norm_stderr": 0.03156564682236784 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.3684210526315789, "acc_stderr": 0.04537815354939391, "acc_norm": 0.3684210526315789, "acc_norm_stderr": 0.04537815354939391 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5517241379310345, "acc_stderr": 0.04144311810878152, "acc_norm": 0.5517241379310345, "acc_norm_stderr": 0.04144311810878152 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.3941798941798942, "acc_stderr": 0.025167982333894143, "acc_norm": 0.3941798941798942, "acc_norm_stderr": 0.025167982333894143 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.42857142857142855, "acc_stderr": 0.04426266681379909, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.04426266681379909 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7741935483870968, "acc_stderr": 0.023785577884181015, "acc_norm": 0.7741935483870968, "acc_norm_stderr": 0.023785577884181015 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.45320197044334976, "acc_stderr": 0.03502544650845872, 
"acc_norm": 0.45320197044334976, "acc_norm_stderr": 0.03502544650845872 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.6, "acc_stderr": 0.049236596391733084, "acc_norm": 0.6, "acc_norm_stderr": 0.049236596391733084 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7878787878787878, "acc_stderr": 0.031922715695483, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.031922715695483 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8535353535353535, "acc_stderr": 0.025190921114603925, "acc_norm": 0.8535353535353535, "acc_norm_stderr": 0.025190921114603925 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9067357512953368, "acc_stderr": 0.02098685459328972, "acc_norm": 0.9067357512953368, "acc_norm_stderr": 0.02098685459328972 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6641025641025641, "acc_stderr": 0.023946724741563973, "acc_norm": 0.6641025641025641, "acc_norm_stderr": 0.023946724741563973 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2777777777777778, "acc_stderr": 0.027309140588230165, "acc_norm": 0.2777777777777778, "acc_norm_stderr":



