open-llm-leaderboard-old/details_uukuguy__speechless-coder-ds-6.7b
数据集概述
数据集创建背景
该数据集是在对模型 uukuguy/speechless-coder-ds-6.7b 进行评估运行期间自动创建的,评估结果展示在 Open LLM Leaderboard 上。
数据集结构
- 配置数量:数据集包含 63 个配置,每个配置对应一个评估任务。
- 运行次数:数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割名称为该次运行的时间戳。
- 训练分割:"train" 分割始终指向最新的结果。
- 结果配置:一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示在 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_uukuguy__speechless-coder-ds-6.7b",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2023-12-30T07:08:30.796108 运行的最新结果:
python { "all": { "acc": 0.38073989952019327, "acc_stderr": 0.03433559818958823, "acc_norm": 0.38307431216916843, "acc_norm_stderr": 0.0350891686808636, "mc1": 0.2607099143206854, "mc1_stderr": 0.015368841620766373, "mc2": 0.4167302788975791, "mc2_stderr": 0.014552137962691033 }, "harness|arc:challenge|25": { "acc": 0.3378839590443686, "acc_stderr": 0.013822047922283516, "acc_norm": 0.36860068259385664, "acc_norm_stderr": 0.014097810678042185 }, "harness|hellaswag|10": { "acc": 0.40300736904999, "acc_stderr": 0.0048949977367190485, "acc_norm": 0.5245966938856802, "acc_norm_stderr": 0.004983740145218606 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.34074074074074073, "acc_stderr": 0.04094376269996794, "acc_norm": 0.34074074074074073, "acc_norm_stderr": 0.04094376269996794 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.3157894736842105, "acc_stderr": 0.03782728980865469, "acc_norm": 0.3157894736842105, "acc_norm_stderr": 0.03782728980865469 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.39, "acc_stderr": 0.04902071300001974, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.42641509433962266, "acc_stderr": 0.030437794342983045, "acc_norm": 0.42641509433962266, "acc_norm_stderr": 0.030437794342983045 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.3541666666666667, "acc_stderr": 0.039994111357535424, "acc_norm": 0.3541666666666667, "acc_norm_stderr": 0.039994111357535424 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.3699421965317919, "acc_stderr": 0.036812296333943194, "acc_norm": 0.3699421965317919, "acc_norm_stderr": 0.036812296333943194 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.23529411764705882, "acc_stderr": 0.04220773659171453, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.04220773659171453 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.63, "acc_stderr": 0.04852365870939099, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.39574468085106385, "acc_stderr": 0.03196758697835361, "acc_norm": 0.39574468085106385, "acc_norm_stderr": 0.03196758697835361 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2631578947368421, "acc_stderr": 0.041424397194893624, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.041424397194893624 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.43448275862068964, "acc_stderr": 0.04130740879555498, "acc_norm": 0.43448275862068964, "acc_norm_stderr": 0.04130740879555498 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.30158730158730157, "acc_stderr": 0.0236369759961018, "acc_norm": 0.30158730158730157, "acc_norm_stderr": 0.0236369759961018 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3253968253968254, "acc_stderr": 0.04190596438871136, "acc_norm": 0.3253968253968254, "acc_norm_stderr": 0.04190596438871136 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.4161290322580645, "acc_stderr": 0.028040981380761543, "acc_norm": 0.4161290322580645, "acc_norm_stderr": 0.028040981380761543 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.2512315270935961, "acc_stderr": 0.030516530732694436, "acc_norm": 0.2512315270935961, "acc_norm_stderr": 0.030516530732694436 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.3333333333333333, "acc_stderr": 0.03681050869161549, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.03681050869161549 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.42424242424242425, "acc_stderr": 0.03521224908841583, "acc_norm": 0.42424242424242425, "acc_norm_stderr": 0.03521224908841583 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.39378238341968913, "acc_stderr": 0.03526077095548237, "acc_norm": 0.39378238341968913, "acc_norm_stderr": 0.03526077095548237 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.35384615384615387, "acc_stderr": 0.024243783994062164, "acc_norm": 0.35384615384615387, "acc_norm_stderr": 0.024243783994062164 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3037037037037037, "acc_stderr": 0.02803792996911499, "acc_norm": 0.3037037037037037, "acc_norm_stderr": 0.02803792996911499 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.3277310924369748, "acc_stderr": 0.030489911417673227, "acc_norm": 0.3277310924369748, "acc_norm_stderr": 0.030489911417673227 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.2980132450331126, "acc_stderr": 0.03734535676787198, "acc_norm": 0.2980132450331126, "acc_norm_stderr": 0.03734535676787198 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.3577981651376147, "acc_stderr": 0.02055206078482782, "acc_norm": 0.3577981651376147, "acc_norm_stderr": 0.02055206078482782 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.4027777777777778, "acc_stderr": 0.03344887382997867, "acc_norm": 0.4027777777777778, "acc_norm_stderr": 
0.03344887382997867 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.37254901960784315, "acc_stderr": 0.03393388584958406, "acc_norm": 0.37254901960784315, "acc_norm_stderr": 0.03393388584958406 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.3459915611814346, "acc_stderr": 0.03096481058878671, "acc_norm": 0.3459915611814346, "acc_norm_stderr": 0.03096481058878671 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.35874439461883406, "acc_stderr": 0.03219079200419995, "acc_norm": 0.35874439461883406, "acc_norm_stderr": 0.03219079200419995 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.48091603053435117, "acc_stderr": 0.04382094705550989, "acc_norm": 0.48091603053435117, "acc_norm_stderr": 0.04382094705550989 }, "harness|hendrycksTest-international_law|5": { "acc": 0.4380165289256198, "acc_stderr": 0.045291468044357915, "acc_norm": 0.4380165289256198, "acc_norm_stderr": 0.045291468044357915 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.3611111111111111, "acc_stderr": 0.04643454608906275, "acc_norm": 0.3611111111111111, "acc_norm_stderr": 0.04643454608906275 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.44171779141104295, "acc_stderr": 0.039015918258361836, "acc_norm": 0.44171779141104295, "acc_norm_stderr": 0.039015918258361836 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.3482142857142857, "acc_stderr": 0.04521829902833585, "acc_norm": 0.3482142857142857, "acc_norm_stderr": 0.04521829902833585 }, "harness|hendrycksTest-management|5": { "acc": 0.39805825242718446, "acc_stderr": 0.04846748253977239, "acc_norm": 0.39805825242718446, "acc_norm_stderr": 0.04846748253977239 }, "harness|hendrycksTest-marketing|5": { "acc": 0.5897435897435898, "acc_stderr": 0.03222414045241108, "acc_norm": 0.5897435897435898, "acc_norm_stderr": 0.03222414045241108 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.47, "acc_stderr": 0.05016135580465919, "acc_norm": 0.47, "acc_norm_stderr": 
0.05016135580465919 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.40485312899106, "acc_stderr": 0.017553246467720253, "acc_norm": 0.40485312899106, "acc_norm_stderr": 0.017553246467720253 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.3872832369942196, "acc_stderr": 0.02622615860512465, "acc_norm": 0.3872832369942196, "acc_norm_stderr": 0.02622615860512465 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.2871508379888268, "acc_stderr": 0.015131608849963729, "acc_norm": 0.2871508379888268, "acc_norm_stderr": 0.015131608849963729 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.3888888888888889, "acc_stderr": 0.02791405551046802, "acc_norm": 0.3888888888888889, "acc_norm_stderr": 0.02791405551046802 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.3729903536977492, "acc_stderr": 0.027466610213140105, "acc_norm": 0.3729903536977492, "acc_norm_stderr": 0.027466610213140105 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.32407407407407407, "acc_stderr": 0.026041766202717167, "acc_norm": 0.32407407407407407, "acc_norm_stderr": 0.026041766202717167 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.29432624113475175, "acc_stderr": 0.0271871270115038, "acc_norm": 0.29432624113475175, "acc_norm_stderr": 0.0271871270115038 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.30182529335071706, "acc_stderr": 0.01172435051810589, "acc_norm": 0.30182529335071706, "acc_norm_stderr": 0.01172435051810589 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.3235294117647059, "acc_stderr": 0.028418208619406787, "acc_norm": 0.3235294117647059, "acc_norm_stderr": 0.028418208619406787 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.3088235294117647, "acc_stderr": 0.01869085027359528, "acc_norm": 0.3088235294117647, "acc_norm_stderr": 0.01869085027359528 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.5, "acc_stderr": 0.04789131426105757, "acc_norm": 0.5, "acc_norm_stderr": 0.04789131426105757 }, 
"harness|hendrycksTest-security_studies|5": { "acc": 0.4326530612244898, "acc_stderr": 0.031717528240626645, "acc_norm": 0.4326530612244898, "acc_norm_stderr": 0.031717528240626645 }, "harness|hendrycksTest-sociology|5": { "acc": 0.36318407960199006, "acc_stderr": 0.034005985055990146, "acc_norm": 0.36318407960199006, "acc_norm_stderr": 0.034005985055990146 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-virology|5": { "acc": 0.41566265060240964, "acc_stderr": 0.038367221765980515, "acc_norm": 0.41566265060240964, "acc_norm_stderr": 0.038367221765980515 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.36257309941520466, "acc_stderr": 0.036871306155620606, "acc_norm": 0.36257309941520466, "acc_norm_stderr": 0.036871306155620606 }, "harness|truthfulqa:mc|0": { "mc1": 0.2607099143206854, "mc1_stderr": 0.015368841620766373, "mc2": 0.4167302788975791, "mc2_stderr": 0.014552137962691033 }, "harness|winogrande|5": { "acc": 0.5887924230465666, "acc_stderr": 0.013829128358676876 }, "harness|gsm8k|5": { "acc": 0.18726307808946172, "acc_stderr": 0.010745914199510825 } }
配置详情
- 配置名称:`harness_arc_challenge_25`
  - 数据文件:
    - 分割:`2023_12_30T07_08_30.796108`
      - 路径:`**/details_harness|arc:challenge|25_2023-12-30T07-08-30.796108.parquet`
    - 分割:`latest`
      - 路径:`**/details_harness|arc:challenge|25_2023-12-30T07-08-30.796108.parquet`
- 配置名称:`harness_gsm8k_5`
  - 数据文件:
    - 分割:`2023_12_30T07_08_30.796108`
      - 路径:`**/details_harness|gsm8k|5_2023-12-30T07-08-30.796108.parquet`
    - 分割:`latest`
      - 路径:`**/details_harness|gsm8k|5_2023-12-30T07-08-30.796108.parquet`
- 配置名称:`harness_hellaswag_10`
  - 数据文件:
    - 分割:`2023_12_30T07_08_30.796108`
      - 路径:`**/details_harness|hellaswag|10_2023-12-30T07-08-30.796108.parquet`
    - 分割:`latest`
      - 路径:`**/details_harness|hellaswag|10_2023-12-30T07-08-30.796108.parquet`
- 配置名称:`harness_hendrycksTest_5`
  - 数据文件:
    - 分割:`2023_12_30T07_08_30.796108`
      - 路径:
**/details_harness|hendrycksTest-abstract_algebra|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-anatomy|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-astronomy|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-business_ethics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-clinical_knowledge|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-college_biology|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-college_chemistry|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-college_computer_science|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-college_mathematics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-college_medicine|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-college_physics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-computer_security|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-conceptual_physics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-econometrics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-electrical_engineering|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-elementary_mathematics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-formal_logic|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-global_facts|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_biology|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_chemistry|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_computer_science|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_european_history|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_geography|5_2023-12-30T07-08-30.7
96108.parquet**/details_harness|hendrycksTest-high_school_government_and_politics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_macroeconomics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_mathematics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_microeconomics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_physics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_psychology|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_statistics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_us_history|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-high_school_world_history|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-human_aging|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-human_sexuality|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-international_law|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-jurisprudence|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-logical_fallacies|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-machine_learning|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-management|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-marketing|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-medical_genetics|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-miscellaneous|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-moral_disputes|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-moral_scenarios|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-nutrition|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-philosophy|5_2023-1
2-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-prehistory|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-professional_accounting|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-professional_law|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-professional_medicine|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-professional_psychology|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-public_relations|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-security_studies|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-sociology|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-us_foreign_policy|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-virology|5_2023-12-30T07-08-30.796108.parquet**/details_harness|hendrycksTest-world_religions|5_2023-12-30T07-08-30.796108.parquet
- 路径:
- 分割:
- 数据文件:




