open-llm-leaderboard-old/details_xzuyn__LLaMa-2-PeanutButter_v4-7B
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上对模型 xzuyn/LLaMa-2-PeanutButter_v4-7B 进行评估运行的过程中自动创建的。
数据集组成
- 数据集包含 61 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建;每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_xzuyn__LLaMa-2-PeanutButter_v4-7B",
    "harness_truthfulqa_mc_0",
    split="train",
)
```
最新结果
以下是 2023-08-29T15:15:59.631802 运行的最新结果: python { "all": { "acc": 0.4754535953456773, "acc_stderr": 0.03543074449128995, "acc_norm": 0.4793512530654778, "acc_norm_stderr": 0.03541409593269912, "mc1": 0.26805385556915545, "mc1_stderr": 0.015506204722834557, "mc2": 0.42310904021377665, "mc2_stderr": 0.015624011969941223 }, "harness|arc:challenge|25": { "acc": 0.507679180887372, "acc_stderr": 0.014609667440892567, "acc_norm": 0.5486348122866894, "acc_norm_stderr": 0.014542104569955265 }, "harness|hellaswag|10": { "acc": 0.6188010356502689, "acc_stderr": 0.004846886929763466, "acc_norm": 0.8078072097191794, "acc_norm_stderr": 0.003932184843841659 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.31, "acc_stderr": 0.046482319871173156, "acc_norm": 0.31, "acc_norm_stderr": 0.046482319871173156 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.45925925925925926, "acc_stderr": 0.04304979692464243, "acc_norm": 0.45925925925925926, "acc_norm_stderr": 0.04304979692464243 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.4276315789473684, "acc_stderr": 0.040260970832965585, "acc_norm": 0.4276315789473684, "acc_norm_stderr": 0.040260970832965585 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.47, "acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.4867924528301887, "acc_stderr": 0.030762134874500482, "acc_norm": 0.4867924528301887, "acc_norm_stderr": 0.030762134874500482 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5, "acc_stderr": 0.04181210050035455, "acc_norm": 0.5, "acc_norm_stderr": 0.04181210050035455 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.43352601156069365, "acc_stderr": 0.03778621079092056, "acc_norm": 0.43352601156069365, "acc_norm_stderr": 0.03778621079092056 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.20588235294117646, "acc_stderr": 0.04023382273617747, "acc_norm": 0.20588235294117646, "acc_norm_stderr": 0.04023382273617747 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.58, "acc_stderr": 0.04960449637488583, "acc_norm": 0.58, "acc_norm_stderr": 0.04960449637488583 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.42127659574468085, "acc_stderr": 0.03227834510146267, "acc_norm": 0.42127659574468085, "acc_norm_stderr": 0.03227834510146267 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.3333333333333333, "acc_stderr": 0.044346007015849245, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.044346007015849245 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4482758620689655, "acc_stderr": 0.041443118108781506, "acc_norm": 0.4482758620689655, "acc_norm_stderr": 0.041443118108781506 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.30158730158730157, "acc_stderr": 0.023636975996101796, "acc_norm": 0.30158730158730157, "acc_norm_stderr": 0.023636975996101796 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.29365079365079366, "acc_stderr": 0.04073524322147126, "acc_norm": 0.29365079365079366, "acc_norm_stderr": 0.04073524322147126 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.5032258064516129, "acc_stderr": 0.028443414226438316, "acc_norm": 0.5032258064516129, "acc_norm_stderr": 0.028443414226438316 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.3694581280788177, "acc_stderr": 0.03395970381998573, "acc_norm": 0.3694581280788177, "acc_norm_stderr": 0.03395970381998573 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6242424242424243, "acc_stderr": 0.03781887353205982, "acc_norm": 0.6242424242424243, "acc_norm_stderr": 0.03781887353205982 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.5909090909090909, "acc_stderr": 0.03502975799413007, "acc_norm": 0.5909090909090909, "acc_norm_stderr": 0.03502975799413007 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7202072538860104, "acc_stderr": 0.032396370467357036, "acc_norm": 0.7202072538860104, "acc_norm_stderr": 0.032396370467357036 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.4666666666666667, "acc_stderr": 0.025294608023986476, "acc_norm": 0.4666666666666667, "acc_norm_stderr": 0.025294608023986476 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2814814814814815, "acc_stderr": 0.027420019350945287, "acc_norm": 0.2814814814814



