open-llm-leaderboard-old/details_eldogbbhed__Peagle-9b
收藏数据集概述
数据集简介
该数据集是在对模型 eldogbbhed/Peagle-9b 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 创建来源:从1次运行中创建。每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。"train" 分割始终指向最新的结果。
- 额外配置:"results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_eldogbbhed__Peagle-9b", "harness_winogrande_5", split="train")
最新结果
以下是 2024-03-11T04:56:46.031904 运行的最新结果:
python { "all": { "acc": 0.6480227913815262, "acc_stderr": 0.032309008293723906, "acc_norm": 0.6486844835270161, "acc_norm_stderr": 0.03296580206090119, "mc1": 0.5532435740514076, "mc1_stderr": 0.017403977522557144, "mc2": 0.7015765443534774, "mc2_stderr": 0.014919144015455953 }, "harness|arc:challenge|25": { "acc": 0.6843003412969283, "acc_stderr": 0.01358257109581529, "acc_norm": 0.7150170648464164, "acc_norm_stderr": 0.013191348179838795 }, "harness|hellaswag|10": { "acc": 0.6965743875721968, "acc_stderr": 0.004587978625582481, "acc_norm": 0.8734315873332006, "acc_norm_stderr": 0.003318093579702919 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.32, "acc_stderr": 0.04688261722621504, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6370370370370371, "acc_stderr": 0.04153948404742398, "acc_norm": 0.6370370370370371, "acc_norm_stderr": 0.04153948404742398 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7039473684210527, "acc_stderr": 0.03715062154998904, "acc_norm": 0.7039473684210527, "acc_norm_stderr": 0.03715062154998904 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.59, "acc_stderr": 0.049431107042371025, "acc_norm": 0.59, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7056603773584905, "acc_stderr": 0.028049186315695255, "acc_norm": 0.7056603773584905, "acc_norm_stderr": 0.028049186315695255 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.75, "acc_stderr": 0.03621034121889507, "acc_norm": 0.75, "acc_norm_stderr": 0.03621034121889507 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.53, "acc_stderr": 0.050161355804659205, "acc_norm": 0.53, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6589595375722543, "acc_stderr": 0.03614665424180826, "acc_norm": 0.6589595375722543, "acc_norm_stderr": 0.03614665424180826 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4411764705882353, "acc_stderr": 0.049406356306056595, "acc_norm": 0.4411764705882353, "acc_norm_stderr": 0.049406356306056595 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5659574468085107, "acc_stderr": 0.032400380867927465, "acc_norm": 0.5659574468085107, "acc_norm_stderr": 0.032400380867927465 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5, "acc_stderr": 0.047036043419179864, "acc_norm": 0.5, "acc_norm_stderr": 0.047036043419179864 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5586206896551724, "acc_stderr": 0.04137931034482758, "acc_norm": 0.5586206896551724, "acc_norm_stderr": 0.04137931034482758 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.3915343915343915, "acc_stderr": 0.025138091388851102, "acc_norm": 0.3915343915343915, "acc_norm_stderr": 0.025138091388851102 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.46825396825396826, "acc_stderr": 0.04463112720677172, "acc_norm": 0.46825396825396826, "acc_norm_stderr": 0.04463112720677172 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7935483870967742, "acc_stderr": 0.023025899617188716, "acc_norm": 0.7935483870967742, "acc_norm_stderr": 0.023025899617188716 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5270935960591133, "acc_stderr": 0.03512819077876106, "acc_norm": 0.5270935960591133, "acc_norm_stderr": 0.03512819077876106 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.68, "acc_stderr": 0.04688261722621505, "acc_norm": 0.68, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.793939393939394, "acc_stderr": 0.0315841532404771, "acc_norm": 0.793939393939394, "acc_norm_stderr": 0.0315841532404771 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7929292929292929, "acc_stderr": 0.02886977846026705, "acc_norm": 0.7929292929292929, "acc_norm_stderr": 0.02886977846026705 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9067357512953368, "acc_stderr": 0.02098685459328972, "acc_norm": 0.9067357512953368, "acc_norm_stderr": 0.02098685459328972 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6692307692307692, "acc_stderr": 0.023854795680971118, "acc_norm": 0.6692307692307692, "acc_norm_stderr": 0.023854795680971118 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3296296296296296, "acc_stderr": 0.028661201116524572, "acc_norm": 0.3296296296296296, "acc_norm_stderr": 0.028661201116524572 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6428571428571429, "



