open-llm-leaderboard-old/details_OpenBuddy__openbuddy-mixtral-8x7b-v16.1-32k
数据集概述
数据集简介
该数据集是在评估模型 OpenBuddy/openbuddy-mixtral-8x7b-v16.1-32k 在 Open LLM Leaderboard 上的运行过程中自动创建的。
数据集结构
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_OpenBuddy__openbuddy-mixtral-8x7b-v16.1-32k", "harness_winogrande_5", split="train")
```
最新结果
以下是 2023-12-30T16:16:53.571803 运行的最新结果:
python { "all": { "acc": 0.6939517211944045, "acc_stderr": 0.030232673494217974, "acc_norm": 0.7084301138333359, "acc_norm_stderr": 0.031054743745039477, "mc1": 0.397796817625459, "mc1_stderr": 0.01713393424855964, "mc2": 0.5597457443511287, "mc2_stderr": 0.014917533204367936 }, "harness|arc:challenge|25": { "acc": 0.23976109215017063, "acc_stderr": 0.012476304127453947, "acc_norm": 0.2909556313993174, "acc_norm_stderr": 0.013273077865907586 }, "harness|hellaswag|10": { "acc": 0.6341366261700856, "acc_stderr": 0.004806870285747291, "acc_norm": 0.8227444732125074, "acc_norm_stderr": 0.0038110434120246514 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6666666666666666, "acc_stderr": 0.04072314811876837, "acc_norm": 0.6666666666666666, "acc_norm_stderr": 0.04072314811876837 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7763157894736842, "acc_stderr": 0.033911609343436025, "acc_norm": 0.7763157894736842, "acc_norm_stderr": 0.033911609343436025 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7849056603773585, "acc_stderr": 0.02528839450289137, "acc_norm": 0.7849056603773585, "acc_norm_stderr": 0.02528839450289137 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.8333333333333334, "acc_stderr": 0.031164899666948614, "acc_norm": 0.8333333333333334, "acc_norm_stderr": 0.031164899666948614 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.54, "acc_stderr": 0.05009082659620332, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6820809248554913, "acc_stderr": 0.0355068398916558, "acc_norm": 0.6820809248554913, "acc_norm_stderr": 0.0355068398916558 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.5196078431372549, "acc_stderr": 0.04971358884367406, "acc_norm": 0.5196078431372549, "acc_norm_stderr": 0.04971358884367406 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.79, "acc_stderr": 0.040936018074033256, "acc_norm": 0.79, "acc_norm_stderr": 0.040936018074033256 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6680851063829787, "acc_stderr": 0.030783736757745653, "acc_norm": 0.6680851063829787, "acc_norm_stderr": 0.030783736757745653 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.6842105263157895, "acc_stderr": 0.043727482902780085, "acc_norm": 0.6842105263157895, "acc_norm_stderr": 0.043727482902780085 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6758620689655173, "acc_stderr": 0.03900432069185555, "acc_norm": 0.6758620689655173, "acc_norm_stderr": 0.03900432069185555 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4894179894179894, "acc_stderr": 0.02574554227604548, "acc_norm": 0.4894179894179894, "acc_norm_stderr": 0.02574554227604548 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5317460317460317, "acc_stderr": 0.04463112720677173, "acc_norm": 0.5317460317460317, "acc_norm_stderr": 0.04463112720677173 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8451612903225807, "acc_stderr": 0.020579287326583227, "acc_norm": 0.8451612903225807, "acc_norm_stderr": 0.020579287326583227 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.5862068965517241, "acc_stderr": 0.03465304488406796, "acc_norm": 0.5862068965517241, "acc_norm_stderr": 0.03465304488406796 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.76, "acc_stderr": 0.042923469599092816, "acc_norm": 0.76, "acc_norm_stderr": 0.042923469599092816 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8242424242424242, "acc_stderr": 0.02972094300622445, "acc_norm": 0.8242424242424242, "acc_norm_stderr": 0.02972094300622445 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8535353535353535, "acc_stderr": 0.025190921114603918, "acc_norm": 0.8535353535353535, "acc_norm_stderr": 0.025190921114603918 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9222797927461139, "acc_stderr": 0.01932180555722315, "acc_norm": 0.9222797927461139, "acc_norm_stderr": 0.01932180555722315 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.7051282051282052, "acc_stderr": 0.023119362758232297, "acc_norm": 0.7051282051282052, "acc_norm_stderr": 0.023119362758232297 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.362962962962963, "acc_stderr": 0.029318203645206865, "acc_norm": 0.362962962962963, "acc_norm_stderr": 0.029318203645206865 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.8109243697478992, "acc_stderr": 0.025435119438105353, "acc_norm": 0.8109243697478992, "acc_norm_stderr": 0.025435119438105353 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.4503311258278146, "acc_stderr": 0.04062290018683776, "acc_norm": 0.4503311258278146, "acc_norm_stderr": 0.04062290018683776 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8917431192660551, "acc_stderr": 0.013321348447611764, "acc_norm": 0.8917431192660551, "acc_norm_stderr": 0.013321348447611764 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5787037037037037, "acc_stderr": 0.033674621388960775, "acc_norm": 0.5787037037037037, "acc_norm_stderr": 
0.033674621388960775 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8480392156862745, "acc_stderr": 0.025195658428931792, "acc_norm": 0.8480392156862745, "acc_norm_stderr": 0.025195658428931792 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.8776371308016878, "acc_stderr": 0.021331741829746793, "acc_norm": 0.8776371308016878, "acc_norm_stderr": 0.021331741829746793 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.7533632286995515, "acc_stderr": 0.028930413120910884, "acc_norm": 0.7533632286995515, "acc_norm_stderr": 0.028930413120910884 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7938931297709924, "acc_stderr": 0.035477710041594626, "acc_norm": 0.7938931297709924, "acc_norm_stderr": 0.035477710041594626 }, "harness|hendrycksTest-international_law|5": { "acc": 0.8512396694214877, "acc_stderr": 0.03248470083807194, "acc_norm": 0.8512396694214877, "acc_norm_stderr": 0.03248470083807194 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.8518518518518519, "acc_stderr": 0.03434300243630999, "acc_norm": 0.8518518518518519, "acc_norm_stderr": 0.03434300243630999 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7975460122699386, "acc_stderr": 0.031570650789119005, "acc_norm": 0.7975460122699386, "acc_norm_stderr": 0.031570650789119005 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.5803571428571429, "acc_stderr": 0.046840993210771065, "acc_norm": 0.5803571428571429, "acc_norm_stderr": 0.046840993210771065 }, "harness|hendrycksTest-management|5": { "acc": 0.8446601941747572, "acc_stderr": 0.035865947385739734, "acc_norm": 0.8446601941747572, "acc_norm_stderr": 0.035865947385739734 }, "harness|hendrycksTest-marketing|5": { "acc": 0.905982905982906, "acc_stderr": 0.01911989279892498, "acc_norm": 0.905982905982906, "acc_norm_stderr": 0.01911989279892498 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.77, "acc_stderr": 0.04229525846816506, "acc_norm": 0.77, "acc_norm_stderr": 
0.04229525846816506 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.8748403575989783, "acc_stderr": 0.011832954239305736, "acc_norm": 0.8748403575989783, "acc_norm_stderr": 0.011832954239305736 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7745664739884393, "acc_stderr": 0.022497230190967558, "acc_norm": 0.7745664739884393, "acc_norm_stderr": 0.022497230190967558 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.5273743016759777, "acc_stderr": 0.016697420650642752, "acc_norm": 0.5273743016759777, "acc_norm_stderr": 0.016697420650642752 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7973856209150327, "acc_stderr": 0.023015446877985693, "acc_norm": 0.7973856209150327, "acc_norm_stderr": 0.023015446877985693 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.7942122186495176, "acc_stderr": 0.022961339906764244, "acc_norm": 0.7942122186495176, "acc_norm_stderr": 0.022961339906764244 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.8333333333333334, "acc_stderr": 0.020736358408060006, "acc_norm": 0.8333333333333334, "acc_norm_stderr": 0.020736358408060006 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.524822695035461, "acc_stderr": 0.0297907192438297, "acc_norm": 0.524822695035461, "acc_norm_stderr": 0.0297907192438297 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.5195567144719687, "acc_stderr": 0.012760464028289299, "acc_norm": 0.5195567144719687, "acc_norm_stderr": 0.012760464028289299 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.7794117647058824, "acc_stderr": 0.02518778666022726, "acc_norm": 0.7794117647058824, "acc_norm_stderr": 0.02518778666022726 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.7549019607843137, "acc_stderr": 0.017401816711427657, "acc_norm": 0.7549019607843137, "acc_norm_stderr": 0.017401816711427657 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.7090909090909091, "acc_stderr": 0.04350271442923243, "acc_norm": 0.7090909090909091, 
"acc_norm_stderr": 0.04350271442923243 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7959183673469388, "acc_stderr": 0.025801283475090506, "acc_norm": 0.7959183673469388, "acc_norm_stderr": 0.025801283475090506 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8706467661691543, "acc_stderr": 0.023729830881018526, "acc_norm": 0.8706467661691543, "acc_norm_stderr": 0.023729830881018526 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.88, "acc_stderr": 0.032659863237109066, "acc_norm": 0.88, "acc_norm_stderr": 0.032659863237109066 }, "harness|hendrycksTest-virology|5": { "acc": 0.5240963855421686, "acc_stderr": 0.03887971849597264, "acc_norm": 0.5240963855421686, "acc_norm_stderr": 0.03887971849597264 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8654970760233918, "acc_stderr": 0.026168221344662297, "acc_norm": 0.8654970760233918, "acc_norm_stderr": 0.026168221344662297 }, "harness|truthfulqa:mc|0": { "mc1": 0.397796817625459, "mc1_stderr": 0.01713393424855964, "mc2": 0.5597457443511287, "mc2_stderr": 0.014917533204367936 }, "harness|winogrande|5": { "acc": 0.7734806629834254, "acc_stderr": 0.011764149054698334 }, "harness|gsm8k|5": { "acc": 0.0, "acc_stderr": 0.0 } }
配置详情
- harness_arc_challenge_25
  - 分割: 2023_12_30T16_16_53.571803
    - 路径: `**/details_harness|arc:challenge|25_2023-12-30T16-16-53.571803.parquet`
  - 分割: latest
    - 路径: `**/details_harness|arc:challenge|25_2023-12-30T16-16-53.571803.parquet`
- harness_gsm8k_5
  - 分割: 2023_12_30T16_16_53.571803
    - 路径: `**/details_harness|gsm8k|5_2023-12-30T16-16-53.571803.parquet`
  - 分割: latest
    - 路径: `**/details_harness|gsm8k|5_2023-12-30T16-16-53.571803.parquet`
- harness_hellaswag_10
  - 分割: 2023_12_30T16_16_53.571803
    - 路径: `**/details_harness|hellaswag|10_2023-12-30T16-16-53.571803.parquet`
  - 分割: latest
    - 路径: `**/details_harness|hellaswag|10_2023-12-30T16-16-53.571803.parquet`
-
harness_hendrycksTest_5
- 分割: 2023_12_30T16_16_53.571803
- 路径:
  - `**/details_harness|hendrycksTest-abstract_algebra|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-anatomy|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-astronomy|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-business_ethics|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-clinical_knowledge|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-college_biology|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-college_chemistry|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-college_computer_science|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-college_mathematics|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-college_medicine|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-college_physics|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-computer_security|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-conceptual_physics|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-econometrics|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-electrical_engineering|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-elementary_mathematics|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-formal_logic|5_2023-12-30T16-16-53.571803.parquet`
  - `**/details_harness|hendrycksTest-global_facts|5_2023-12-30T16-16-53.571803.parquet`
- 路径:
- 分割: 2023_12_30T16_16_53.571803



