open-llm-leaderboard-old/details_Locutusque__Hyperion-1.5-Mistral-7B
数据集概述
数据集简介
该数据集是在评估模型 Locutusque/Hyperion-1.5-Mistral-7B 在 Open LLM Leaderboard 上的运行过程中自动创建的。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 数据来源:数据集由 1 次运行生成;每次运行在每个配置中对应一个独立的分割,分割以该次运行的时间戳命名。
- 最新结果:"train" 分割始终指向最新的结果。
- 汇总结果:一个额外的配置 "results" 存储所有运行的汇总结果,用于计算和显示在 Open LLM Leaderboard 上的汇总指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_Locutusque__Hyperion-1.5-Mistral-7B",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-03-03T06:03:37.474849 运行的最新结果:
python { "all": { "acc": 0.6333329963434157, "acc_stderr": 0.03235977051574735, "acc_norm": 0.6385840230775544, "acc_norm_stderr": 0.033010512433886766, "mc1": 0.2729498164014688, "mc1_stderr": 0.015594753632006526, "mc2": 0.4177755099939499, "mc2_stderr": 0.014043002514832767 }, "harness|arc:challenge|25": { "acc": 0.5750853242320819, "acc_stderr": 0.014445698968520769, "acc_norm": 0.6049488054607508, "acc_norm_stderr": 0.014285898292938165 }, "harness|hellaswag|10": { "acc": 0.6321449910376419, "acc_stderr": 0.0048123610604939235, "acc_norm": 0.836387173869747, "acc_norm_stderr": 0.003691678495767967 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6296296296296297, "acc_stderr": 0.041716541613545426, "acc_norm": 0.6296296296296297, "acc_norm_stderr": 0.041716541613545426 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6644736842105263, "acc_stderr": 0.03842498559395269, "acc_norm": 0.6644736842105263, "acc_norm_stderr": 0.03842498559395269 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.59, "acc_stderr": 0.04943110704237102, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7018867924528301, "acc_stderr": 0.02815283794249387, "acc_norm": 0.7018867924528301, "acc_norm_stderr": 0.02815283794249387 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7291666666666666, "acc_stderr": 0.03716177437566017, "acc_norm": 0.7291666666666666, "acc_norm_stderr": 0.03716177437566017 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.53, "acc_stderr": 0.050161355804659205, "acc_norm": 0.53, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.37, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6647398843930635, "acc_stderr": 0.03599586301247077, "acc_norm": 0.6647398843930635, "acc_norm_stderr": 0.03599586301247077 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3627450980392157, "acc_stderr": 0.04784060704105652, "acc_norm": 0.3627450980392157, "acc_norm_stderr": 0.04784060704105652 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.78, "acc_stderr": 0.04163331998932261, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932261 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5702127659574469, "acc_stderr": 0.03236214467715564, "acc_norm": 0.5702127659574469, "acc_norm_stderr": 0.03236214467715564 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4824561403508772, "acc_stderr": 0.04700708033551038, "acc_norm": 0.4824561403508772, "acc_norm_stderr": 0.04700708033551038 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5793103448275863, "acc_stderr": 0.0411391498118926, "acc_norm": 0.5793103448275863, "acc_norm_stderr": 0.0411391498118926 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.37037037037037035, "acc_stderr": 0.024870815251057093, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.024870815251057093 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.40476190476190477, "acc_stderr": 0.04390259265377562, "acc_norm": 0.40476190476190477, "acc_norm_stderr": 0.04390259265377562 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7580645161290323, "acc_stderr": 0.024362599693031096, "acc_norm": 0.7580645161290323, "acc_norm_stderr": 0.024362599693031096 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.4876847290640394, "acc_stderr": 0.035169204442208966, "acc_norm": 0.4876847290640394, "acc_norm_stderr": 0.035169204442208966 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.68, "acc_stderr": 0.04688261722621504, "acc_norm": 0.68, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7696969696969697, "acc_stderr": 0.032876667586034906, "acc_norm": 0.7696969696969697, "acc_norm_stderr": 0.032876667586034906 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7878787878787878, "acc_stderr": 0.029126522834586804, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.029126522834586804 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8704663212435233, "acc_stderr": 0.024233532297758733, "acc_norm": 0.8704663212435233, "acc_norm_stderr": 0.024233532297758733 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6564102564102564, "acc_stderr": 0.024078696580635484, "acc_norm": 0.6564102564102564, "acc_norm_stderr": 0.024078696580635484 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.34814814814814815, "acc_stderr": 0.02904560029061626, "acc_norm": 0.34814814814814815, "acc_norm_stderr": 0.02904560029061626 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6680672268907563, "acc_stderr": 0.03058869701378364, "acc_norm": 0.6680672268907563, "acc_norm_stderr": 0.03058869701378364 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.33774834437086093, "acc_stderr": 0.0386155754625517, "acc_norm": 0.33774834437086093, "acc_norm_stderr": 0.0386155754625517 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8275229357798165, "acc_stderr": 0.01619780795684805, "acc_norm": 0.8275229357798165, "acc_norm_stderr": 0.01619780795684805 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5324074074074074, "acc_stderr": 0.03402801581358966, "acc_norm": 0.5324074074074074, "acc_norm_stderr": 
0.03402801581358966 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8088235294117647, "acc_stderr": 0.02759917430064076, "acc_norm": 0.8088235294117647, "acc_norm_stderr": 0.02759917430064076 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.7510548523206751, "acc_stderr": 0.028146970599422644, "acc_norm": 0.7510548523206751, "acc_norm_stderr": 0.028146970599422644 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6816143497757847, "acc_stderr": 0.03126580522513713, "acc_norm": 0.6816143497757847, "acc_norm_stderr": 0.03126580522513713 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7862595419847328, "acc_stderr": 0.0359546161177469, "acc_norm": 0.7862595419847328, "acc_norm_stderr": 0.0359546161177469 }, "harness|hendrycksTest-international_law|5": { "acc": 0.7768595041322314, "acc_stderr": 0.03800754475228732, "acc_norm": 0.7768595041322314, "acc_norm_stderr": 0.03800754475228732 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.75, "acc_stderr": 0.04186091791394607, "acc_norm": 0.75, "acc_norm_stderr": 0.04186091791394607 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7914110429447853, "acc_stderr": 0.031921934489347235, "acc_norm": 0.7914110429447853, "acc_norm_stderr": 0.031921934489347235 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.49107142857142855, "acc_stderr": 0.04745033255489123, "acc_norm": 0.49107142857142855, "acc_norm_stderr": 0.04745033255489123 }, "harness|hendrycksTest-management|5": { "acc": 0.7961165048543689, "acc_stderr": 0.03989139859531771, "acc_norm": 0.7961165048543689, "acc_norm_stderr": 0.03989139859531771 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8717948717948718, "acc_stderr": 0.02190190511507333, "acc_norm": 0.8717948717948718, "acc_norm_stderr": 0.02190190511507333 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.74, "acc_stderr": 0.04408440022768078, "acc_norm": 0.74, "acc_norm_stderr": 0.04408440022768078 }, 
"harness|hendrycksTest-miscellaneous|5": { "acc": 0.8071519795657727, "acc_stderr": 0.014108533515757431, "acc_norm": 0.8071519795657727, "acc_norm_stderr": 0.014108533515757431 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7167630057803468, "acc_stderr": 0.024257901705323378, "acc_norm": 0.7167630057803468, "acc_norm_stderr": 0.024257901705323378 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.2994413407821229, "acc_stderr": 0.015318257745976706, "acc_norm": 0.2994413407821229, "acc_norm_stderr": 0.015318257745976706 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7483660130718954, "acc_stderr": 0.024848018263875195, "acc_norm": 0.7483660130718954, "acc_norm_stderr": 0.024848018263875195 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.7009646302250804, "acc_stderr": 0.02600330111788514, "acc_norm": 0.7009646302250804, "acc_norm_stderr": 0.02600330111788514 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7191358024691358, "acc_stderr": 0.02500646975579921, "acc_norm": 0.7191358024691358, "acc_norm_stderr": 0.02500646975579921 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.46099290780141844, "acc_stderr": 0.02973659252642444, "acc_norm": 0.46099290780141844, "acc_norm_stderr": 0.02973659252642444 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.44198174706649285, "acc_stderr": 0.01268397251359881, "acc_norm": 0.44198174706649285, "acc_norm_stderr": 0.01268397251359881 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6691176470588235, "acc_stderr": 0.028582709753898445, "acc_norm": 0.6691176470588235, "acc_norm_stderr": 0.028582709753898445 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6454248366013072, "acc_stderr": 0.019353360547553697, "acc_norm": 0.6454248366013072, "acc_norm_stderr": 0.019353360547553697 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6545454545454545, "acc_stderr": 0.04554619617541054, "acc_norm": 0.6545454545454545, "acc_norm_stderr": 
0.04554619617541054 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7224489795918367, "acc_stderr": 0.028666857790274648, "acc_norm": 0.7224489795918367, "acc_norm_stderr": 0.028666857790274648 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8507462686567164, "acc_stderr": 0.02519692987482708, "acc_norm": 0.8507462686567164, "acc_norm_stderr": 0.02519692987482708 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.86, "acc_stderr": 0.034873508801977704, "acc_norm": 0.86, "acc_norm_stderr": 0.034873508801977704 }, "harness|hendrycksTest-virology|5": { "acc": 0.5421686746987951, "acc_stderr": 0.0387862677100236, "acc_norm": 0.5421686746987951, "acc_norm_stderr": 0.0387862677100236 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8070175438596491, "acc_stderr": 0.030267457554898458, "acc_norm": 0.8070175438596491, "acc_norm_stderr": 0.030267457554898458 }, "harness|truthfulqa:mc|0": { "mc1": 0.2729498164014688, "mc1_stderr": 0.015594753632006526, "mc2": 0.4177755099939499, "mc2_stderr": 0.014043002514832767 }, "harness|winogrande|5": { "acc": 0.7861089187056038, "acc_stderr": 0.011524466954090254 }, "harness|gsm8k|5": { "acc": 0.4048521607278241, "acc_stderr": 0.013520817666870496 } }
配置详情
- harness_arc_challenge_25
  - 分割:2024_03_03T06_03_37.474849, latest
  - 路径:`**/details_harness|arc:challenge|25_2024-03-03T06-03-37.474849.parquet`
- harness_gsm8k_5
  - 分割:2024_03_03T06_03_37.474849, latest
  - 路径:`**/details_harness|gsm8k|5_2024-03-03T06-03-37.474849.parquet`
- harness_hellaswag_10
  - 分割:2024_03_03T06_03_37.474849, latest
  - 路径:`**/details_harness|hellaswag|10_2024-03-03T06-03-37.474849.parquet`
- harness_hendrycksTest_5
  - 分割:2024_03_03T06_03_37.474849, latest
  - 路径:
    - `**/details_harness|hendrycksTest-abstract_algebra|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-anatomy|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-astronomy|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-business_ethics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-clinical_knowledge|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-college_biology|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-college_chemistry|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-college_computer_science|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-college_mathematics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-college_medicine|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-college_physics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-computer_security|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-conceptual_physics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-econometrics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-electrical_engineering|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-elementary_mathematics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-formal_logic|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-global_facts|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_biology|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_chemistry|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_computer_science|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_european_history|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_geography|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_government_and_politics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_macroeconomics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_mathematics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_microeconomics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_physics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_psychology|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_statistics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_us_history|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-high_school_world_history|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-human_aging|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-human_sexuality|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-international_law|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-jurisprudence|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-logical_fallacies|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-machine_learning|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-management|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-marketing|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-medical_genetics|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-miscellaneous|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-moral_disputes|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-moral_scenarios|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-nutrition|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-philosophy|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-prehistory|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-professional_accounting|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-professional_law|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-professional_medicine|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-professional_psychology|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-public_relations|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-security_studies|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-sociology|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-us_foreign_policy|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-virology|5_2024-03-03T06-03-37.474849.parquet`
    - `**/details_harness|hendrycksTest-world_religions|5_2024-03-03T06-03-37.474849.parquet`



