open-llm-leaderboard-old/details_eren23__FrankenBeagle-SmallOverlap-test
数据集概述
该数据集是在 Open LLM Leaderboard 上评估模型 eren23/FrankenBeagle-SmallOverlap-test 的运行过程中自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。
数据集结构
- 配置数量:63 个配置
- 数据来源:由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- 训练分割:"train" 分割始终指向最新的结果。
- 结果配置:"results" 配置存储所有运行的聚合结果,用于计算并在 Open LLM Leaderboard 上显示聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_eren23__FrankenBeagle-SmallOverlap-test", "harness_winogrande_5", split="train")
```
最新结果
以下是 最新结果 的摘要:
python { "all": { "acc": 0.6516048577093372, "acc_stderr": 0.03217886824872047, "acc_norm": 0.6522986567968078, "acc_norm_stderr": 0.03283383614658343, "mc1": 0.5642594859241126, "mc1_stderr": 0.017358345398863134, "mc2": 0.6969160518300113, "mc2_stderr": 0.015146787132780715 }, "harness|arc:challenge|25": { "acc": 0.6945392491467577, "acc_stderr": 0.013460080478002505, "acc_norm": 0.7201365187713311, "acc_norm_stderr": 0.01311904089772592 }, "harness|hellaswag|10": { "acc": 0.7171878111929895, "acc_stderr": 0.004494454911844622, "acc_norm": 0.8815972913762199, "acc_norm_stderr": 0.003224240722351316 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.04725815626252605, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252605 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6074074074074074, "acc_stderr": 0.0421850621536888, "acc_norm": 0.6074074074074074, "acc_norm_stderr": 0.0421850621536888 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6907894736842105, "acc_stderr": 0.037610708698674805, "acc_norm": 0.6907894736842105, "acc_norm_stderr": 0.037610708698674805 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.65, "acc_stderr": 0.0479372485441102, "acc_norm": 0.65, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7018867924528301, "acc_stderr": 0.02815283794249386, "acc_norm": 0.7018867924528301, "acc_norm_stderr": 0.02815283794249386 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7638888888888888, "acc_stderr": 0.03551446610810826, "acc_norm": 0.7638888888888888, "acc_norm_stderr": 0.03551446610810826 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.57, "acc_stderr": 0.04975698519562428, "acc_norm": 0.57, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6705202312138728, "acc_stderr": 0.03583901754736412, "acc_norm": 0.6705202312138728, "acc_norm_stderr": 0.03583901754736412 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.04897104952726366, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.04897104952726366 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.04229525846816506, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816506 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5617021276595745, "acc_stderr": 0.03243618636108102, "acc_norm": 0.5617021276595745, "acc_norm_stderr": 0.03243618636108102 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.47368421052631576, "acc_stderr": 0.04697085136647863, "acc_norm": 0.47368421052631576, "acc_norm_stderr": 0.04697085136647863 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5517241379310345, "acc_stderr": 0.04144311810878151, "acc_norm": 0.5517241379310345, "acc_norm_stderr": 0.04144311810878151 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4074074074074074, "acc_stderr": 0.025305906241590632, "acc_norm": 0.4074074074074074, "acc_norm_stderr": 0.025305906241590632 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.47619047619047616, "acc_stderr": 0.04467062628403273, "acc_norm": 0.47619047619047616, "acc_norm_stderr": 0.04467062628403273 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.37, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7838709677419354, "acc_stderr": 0.02341529343356853, "acc_norm": 0.7838709677419354, "acc_norm_stderr": 0.02341529343356853 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5073891625615764, "acc_stderr": 0.035176035403610105, 
"acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.71, "acc_stderr": 0.045604802157206845, "acc_norm": 0.71, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7696969696969697, "acc_stderr": 0.0328766675860349, "acc_norm": 0.7696969696969697, "acc_norm_stderr": 0.0328766675860349 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7777777777777778, "acc_stderr": 0.02962022787479048, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.02962022787479048 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9119170984455959, "acc_stderr": 0.02045374660160103, "acc_norm": 0.9119170984455959, "acc_norm_stderr": 0.02045374660160103 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6820512820512821, "acc_stderr": 0.023610884308927865, "acc_norm": 0.6820512820512821, "acc_norm_stderr": 0.023610884308927865 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3296296296296296, "acc_stderr": 0.028661201116524572, "acc_norm": 0.3296296296296296, "acc_norm_stderr": 0.028661201116524572 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6596638655462185, "acc_stderr": 0.030778057422931673, "acc_norm": 0.6596638655462185, "acc_norm_stderr": 0.030778057422931673 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.36423841059602646, "acc_stderr": 0.03929111781242742, "acc_norm": 0.36423841059602646, "acc_norm_stderr": 0.03929111781242742 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8366972477064221, "acc_stderr": 0.01584825580650155, "acc_norm": 0.8366972477064221, "acc_norm_stderr": 0.01584825580650155 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5555555555555556, "acc_stderr": 0.03388857118502325, "acc_norm": 0.5555555555555556, "acc_norm_stderr": 0.03388857118502325 }, 
"harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8382352941176471, "acc_stderr": 0.025845017986926917, "acc_norm": 0.8382352941176471, "acc_norm_stderr": 0.025845017986926917 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.7848101265822784, "acc_stderr": 0.02675082699467617, "acc_norm": 0.7848101265822784, "acc_norm_stderr": 0.02675082699467617 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6995515695067265, "acc_stderr": 0.030769352008229143, "acc_norm": 0.6995515695067265, "acc_norm_stderr": 0.030769352008229143 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7786259541984732, "acc_stderr": 0.03641297081313729, "acc_norm": 0.7786259541984732, "acc_norm_stderr": 0.03641297081313729 }, "harness|hendrycksTest-international_law|5": { "acc": 0.7768595041322314, "acc_stderr": 0.03800754475228732, "acc_norm": 0.7768595041322314, "acc_norm_stderr": 0.03800754475228732 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7685185185185185, "acc_stderr": 0.04077494709252627, "acc_norm": 0.7685185185185185, "acc_norm_stderr": 0.04077494709252627 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7730061349693251, "acc_stderr": 0.03291099578615769, "acc_norm": 0.7730061349693251, "acc_norm_stderr": 0.03291099578615769 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.44642857142857145, "acc_stderr": 0.04718471485219588, "acc_norm": 0.44642857142857145, "acc_norm_stderr": 0.04718471485219588 }, "harness|hendrycksTest-management|5": { "acc": 0.7766990291262136, "acc_stderr": 0.04123553189891431, "acc_norm": 0.7766990291262136, "acc_norm_stderr": 0.04123553189891431 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8675213675213675, "acc_stderr": 0.022209309073165612, "acc_norm": 0.8675213675213675, "acc_norm_stderr": 0.022209309073165612 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.67, "acc_stderr": 0.04725815626252609, "acc_norm": 0.67, "acc_norm_stderr": 0.04725815626252609 }, 
"harness|hendrycksTest-miscellaneous|5": { "acc": 0.8263090676883781, "acc_stderr": 0.01354741565866226, "acc_norm": 0.8263090676883781, "acc_norm_stderr": 0.01354741565866226 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7254335260115607, "acc_stderr": 0.02402774515526502, "acc_norm": 0.7254335260115607, "acc_norm_stderr": 0.02402774515526502 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.42793296089385474, "acc_stderr": 0.01654788799741611, "acc_norm": 0.42793296089385474, "acc_norm_stderr": 0.01654788799741611 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7189542483660131, "acc_stderr": 0.025738854797818737, "acc_norm": 0.7189542483660131, "acc_norm_stderr": 0.025738854797818737 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.7170418006430869, "acc_stderr": 0.025583062489984813, "acc_norm": 0.7170418006430869, "acc_norm_stderr": 0.025583062489984813 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.75, "acc_stderr": 0.02409347123262133, "acc_norm": 0.75, "acc_norm_stderr": 0.02409347123262133 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.4787234042553192, "acc_stderr": 0.029800481645628693, "acc_norm": 0.4787234042553192, "acc_norm_stderr": 0.029800481645628693 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.47131681877444587, "acc_stderr": 0.012749206007657476, "acc_norm": 0.47131681877444587, "acc_norm_stderr": 0.012749206007657476 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6838235294117647, "acc_stderr": 0.028245687391462927, "acc_norm": 0.6838235294117647, "acc_norm_stderr": 0.028245687391462927 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6699346405228758, "acc_stderr": 0.019023726160724553, "acc_norm": 0.6699346405228758, "acc_norm_stderr": 0.019023726160724553 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6636363636363637, "acc_stderr": 0.04525393596302506, "acc_norm": 0.6636363636363637, "acc_norm_stderr": 0.04525393596302506 }, 
"harness|hendrycksTest-security_studies|5": { "acc": 0.7224489795918367, "acc_stderr": 0.02866685779027465, "acc_norm": 0.7224489795918367, "acc_norm_stderr": 0.02866685779027465 }, "harness|hendrycksTest-sociology|5": { "acc": 0.845771144278607, "acc_stderr": 0.025538433368578323, "acc_norm": 0.845771144278607, "acc_norm_stderr": 0.025538433368578323 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.87, "acc_stderr": 0.03379976689896308, "acc_norm": 0.87, "acc_norm_stderr": 0.03379976689896308 }, "harness|hendrycksTest-virology|5": { "acc": 0.572289156626506, "acc_stderr": 0.038515976837185335, "acc_norm": 0.572289156626506, "acc_norm_stderr": 0.038515976837185335 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8245614035087719, "acc_stderr": 0.029170885500727665, "acc_norm": 0.8245614035087719, "acc_norm_stderr": 0.029170885500727665 }, "harness|truthfulqa:mc|0": { "mc1": 0.5642594859241126, "mc1_stderr": 0.017358345398863134, "mc2": 0.6969160518300113, "mc2_stderr": 0.015146787132780715 }, "harness|winogrande|5": { "acc": 0.8184688239936859, "acc_stderr": 0.01083327651500748 }, "harness|gsm8k|5": { "acc": 0.6338134950720242, "acc_stderr": 0.013270100238748835 } }
数据集配置
- 配置名称:harness_arc_challenge_25
  - 数据文件:
    - 分割:2024_01_28T18_01_48.091573
      - 路径:`**/details_harness|arc:challenge|25_2024-01-28T18-01-48.091573.parquet`
    - 分割:latest
      - 路径:`**/details_harness|arc:challenge|25_2024-01-28T18-01-48.091573.parquet`
- 配置名称:harness_gsm8k_5
  - 数据文件:
    - 分割:2024_01_28T18_01_48.091573
      - 路径:`**/details_harness|gsm8k|5_2024-01-28T18-01-48.091573.parquet`
    - 分割:latest
      - 路径:`**/details_harness|gsm8k|5_2024-01-28T18-01-48.091573.parquet`
- 配置名称:harness_hellaswag_10
  - 数据文件:
    - 分割:2024_01_28T18_01_48.091573
      - 路径:`**/details_harness|hellaswag|10_2024-01-28T18-01-48.091573.parquet`
    - 分割:latest
      - 路径:`**/details_harness|hellaswag|10_2024-01-28T18-01-48.091573.parquet`
-
配置名称:harness_hendrycksTest_5
- 数据文件:
- 分割:2024_01_28T18_01_48.091573
- 路径:
**/details_harness|hendrycksTest-abstract_algebra|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-anatomy|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-astronomy|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-business_ethics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-clinical_knowledge|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-college_biology|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-college_chemistry|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-college_computer_science|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-college_mathematics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-college_medicine|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-college_physics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-computer_security|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-conceptual_physics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-econometrics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-electrical_engineering|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-elementary_mathematics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-formal_logic|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-global_facts|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_biology|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_chemistry|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_computer_science|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_european_history|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_geography|5_2024-01-28T18-01-48.0
91573.parquet**/details_harness|hendrycksTest-high_school_government_and_politics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_macroeconomics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_mathematics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_microeconomics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_physics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_psychology|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_statistics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_us_history|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-high_school_world_history|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-human_aging|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-human_sexuality|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-international_law|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-jurisprudence|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-logical_fallacies|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-machine_learning|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-management|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-marketing|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-medical_genetics|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-miscellaneous|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-moral_disputes|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-moral_scenarios|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-nutrition|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-philosophy|5_2024-0
1-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-prehistory|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-professional_accounting|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-professional_law|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-professional_medicine|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-professional_psychology|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-public_relations|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-security_studies|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-sociology|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-us_foreign_policy|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-virology|5_2024-01-28T18-01-48.091573.parquet**/details_harness|hendrycksTest-world_religions|5_2024-01-28T18-01-48.091573.parquet
- 路径:
- 分割:2024_01_28T18_01_48.091573
- 数据文件:




