open-llm-leaderboard-old/details_saltlux__luxia-21.4b-alignment-v0.4
数据集概述
数据集简介
该数据集是在评估模型 saltlux/luxia-21.4b-alignment-v0.4 的过程中自动创建的,用于 Open LLM Leaderboard 的评估。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 数据来源:数据集由 1 次运行生成,每次运行的结果存储在独立的分割中,分割名称为该次运行的时间戳。
- 最新结果:"train" 分割始终指向最新的结果。
- 结果汇总:一个额外的配置 "results" 存储所有运行的汇总结果,用于计算和显示在 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_saltlux__luxia-21.4b-alignment-v0.4",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-03-11T19:32:53.452866 运行的最新结果:
```python
{
    "all": { "acc": 0.6863789863021343, "acc_stderr": 0.031444086687144476, "acc_norm": 0.6860944038337341, "acc_norm_stderr": 0.03210403094277028, "mc1": 0.6352509179926561, "mc1_stderr": 0.016850961061720137, "mc2": 0.7671915273948061, "mc2_stderr": 0.01385022212840208 },
    "harness|arc:challenge|25": { "acc": 0.7636518771331058, "acc_stderr": 0.012414960524301823, "acc_norm": 0.7687713310580204, "acc_norm_stderr": 0.012320858834772281 },
    "harness|hellaswag|10": { "acc": 0.8138816968731328, "acc_stderr": 0.0038840668811314745, "acc_norm": 0.9183429595698068, "acc_norm_stderr": 0.002732818472008806 },
    "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 },
    "harness|hendrycksTest-anatomy|5": { "acc": 0.6370370370370371, "acc_stderr": 0.04153948404742398, "acc_norm": 0.6370370370370371, "acc_norm_stderr": 0.04153948404742398 },
    "harness|hendrycksTest-astronomy|5": { "acc": 0.7697368421052632, "acc_stderr": 0.03426059424403165, "acc_norm": 0.7697368421052632, "acc_norm_stderr": 0.03426059424403165 },
    "harness|hendrycksTest-business_ethics|5": { "acc": 0.73, "acc_stderr": 0.04461960433384741, "acc_norm": 0.73, "acc_norm_stderr": 0.04461960433384741 },
    "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7433962264150943, "acc_stderr": 0.026880647889051968, "acc_norm": 0.7433962264150943, "acc_norm_stderr": 0.026880647889051968 },
    "harness|hendrycksTest-college_biology|5": { "acc": 0.8263888888888888, "acc_stderr": 0.03167473383795718, "acc_norm": 0.8263888888888888, "acc_norm_stderr": 0.03167473383795718 },
    "harness|hendrycksTest-college_chemistry|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 },
    "harness|hendrycksTest-college_computer_science|5": { "acc": 0.6, "acc_stderr": 0.049236596391733084, "acc_norm": 0.6, "acc_norm_stderr": 0.049236596391733084 },
    "harness|hendrycksTest-college_mathematics|5": { "acc": 0.43, "acc_stderr": 0.04975698519562428, "acc_norm": 0.43, "acc_norm_stderr": 0.04975698519562428 },
    "harness|hendrycksTest-college_medicine|5": { "acc": 0.6416184971098265, "acc_stderr": 0.03656343653353159, "acc_norm": 0.6416184971098265, "acc_norm_stderr": 0.03656343653353159 },
    "harness|hendrycksTest-college_physics|5": { "acc": 0.43137254901960786, "acc_stderr": 0.04928099597287534, "acc_norm": 0.43137254901960786, "acc_norm_stderr": 0.04928099597287534 },
    "harness|hendrycksTest-computer_security|5": { "acc": 0.81, "acc_stderr": 0.039427724440366234, "acc_norm": 0.81, "acc_norm_stderr": 0.039427724440366234 },
    "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.676595744680851, "acc_stderr": 0.030579442773610334, "acc_norm": 0.676595744680851, "acc_norm_stderr": 0.030579442773610334 },
    "harness|hendrycksTest-econometrics|5": { "acc": 0.5614035087719298, "acc_stderr": 0.04668000738510455, "acc_norm": 0.5614035087719298, "acc_norm_stderr": 0.04668000738510455 },
    "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6413793103448275, "acc_stderr": 0.039966295748767186, "acc_norm": 0.6413793103448275, "acc_norm_stderr": 0.039966295748767186 },
    "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.5238095238095238, "acc_stderr": 0.025722097064388518, "acc_norm": 0.5238095238095238, "acc_norm_stderr": 0.025722097064388518 },
    "harness|hendrycksTest-formal_logic|5": { "acc": 0.4523809523809524, "acc_stderr": 0.044518079590553275, "acc_norm": 0.4523809523809524, "acc_norm_stderr": 0.044518079590553275 },
    "harness|hendrycksTest-global_facts|5": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 },
    "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8451612903225807, "acc_stderr": 0.020579287326583227, "acc_norm": 0.8451612903225807, "acc_norm_stderr": 0.020579287326583227 },
    "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5960591133004927, "acc_stderr": 0.034524539038220316, "acc_norm": 0.5960591133004927, "acc_norm_stderr": 0.034524539038220316 },
    "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.74, "acc_stderr": 0.0440844002276808, "acc_norm": 0.74, "acc_norm_stderr": 0.0440844002276808 },
    "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8303030303030303, "acc_stderr": 0.029311188674983106, "acc_norm": 0.8303030303030303, "acc_norm_stderr": 0.029311188674983106 },
    "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8434343434343434, "acc_stderr": 0.025890520358141454, "acc_norm": 0.8434343434343434, "acc_norm_stderr": 0.025890520358141454 },
    "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8860103626943006, "acc_stderr": 0.022935144053919436, "acc_norm": 0.8860103626943006, "acc_norm_stderr": 0.022935144053919436 },
    "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.7025641025641025, "acc_stderr": 0.023177408131465946, "acc_norm": 0.7025641025641025, "acc_norm_stderr": 0.023177408131465946 },
    "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3814814814814815, "acc_stderr": 0.0296167189274976, "acc_norm": 0.3814814814814815, "acc_norm_stderr": 0.0296167189274976 },
    "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.7647058823529411, "acc_stderr": 0.027553614467863804, "acc_norm": 0.7647058823529411, "acc_norm_stderr": 0.027553614467863804 },
    "harness|hendrycksTest-high_school_physics|5": { "acc": 0.45695364238410596, "acc_stderr": 0.04067325174247443, "acc_norm": 0.45695364238410596, "acc_norm_stderr": 0.04067325174247443 },
    "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8568807339449541, "acc_stderr": 0.015014462497168597, "acc_norm": 0.8568807339449541, "acc_norm_stderr": 0.015014462497168597 },
    "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5740740740740741, "acc_stderr": 0.03372343271653062, "acc_norm": 0.5740740740740741, "acc_norm_stderr": 0.03372343271653062 },
    "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8676470588235294, "acc_stderr": 0.02378429752091885, "acc_norm": 0.8676470588235294, "acc_norm_stderr": 0.02378429752091885 },
    "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.8438818565400844, "acc_stderr": 0.023627159460318688, "acc_norm": 0.8438818565400844, "acc_norm_stderr": 0.023627159460318688 },
    "harness|hendrycksTest-human_aging|5": { "acc": 0.7443946188340808, "acc_stderr": 0.029275891003969927, "acc_norm": 0.7443946188340808, "acc_norm_stderr": 0.029275891003969927 },
    "harness|hendrycksTest-human_sexuality|5": { "acc": 0.6641221374045801, "acc_stderr": 0.041423137719966634, "acc_norm": 0.6641221374045801, "acc_norm_stderr": 0.041423137719966634 },
    "harness|hendrycksTest-international_law|5": { "acc": 0.8347107438016529, "acc_stderr": 0.03390780612972776, "acc_norm": 0.8347107438016529, "acc_norm_stderr": 0.03390780612972776 },
    "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7777777777777778, "acc_stderr": 0.040191074725573483, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.040191074725573483 },
    "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7484662576687117, "acc_stderr": 0.034089978868575295, "acc_norm": 0.7484662576687117, "acc_norm_stderr": 0.034089978868575295 },
    "harness|hendrycksTest-machine_learning|5": { "acc": 0.5, "acc_stderr": 0.04745789978762494, "acc_norm": 0.5, "acc_norm_stderr": 0.04745789978762494 },
    "harness|hendrycksTest-management|5": { "acc": 0.8252427184466019, "acc_stderr": 0.03760178006026621, "acc_norm": 0.8252427184466019, "acc_norm_stderr": 0.03760178006026621 },
    "harness|hendrycksTest-marketing|5": { "acc": 0.9017094017094017, "acc_stderr": 0.019503444900757567, "acc_norm": 0.9017094017094017, "acc_norm_stderr": 0.019503444900757567 },
    "harness|hendrycksTest-medical_genetics|5": { "acc": 0.73, "acc_stderr": 0.044619604333847394, "acc_norm": 0.73, "acc_norm_stderr": 0.044619604333847394 },
    "harness|hendrycksTest-miscellaneous|5": { "acc": 0.80970625798212, "acc_stderr": 0.014036945850381384, "acc_norm": 0.80970625798212, "acc_norm_stderr": 0.014036945850381384 },
    "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7109826589595376, "acc_stderr": 0.02440517393578323, "acc_norm": 0.7109826589595376, "acc_norm_stderr": 0.02440517393578323 },
    "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.45139664804469276, "acc_stderr": 0.016643307372315876, "acc_norm": 0.45139664804469276, "acc_norm_stderr": 0.016643307372315876 },
    "harness|hendrycksTest-nutrition|5": { "acc": 0.7843137254901961, "acc_stderr": 0.023550831351995094, "acc_norm": 0.7843137254901961, "acc_norm_stderr": 0.023550831351995094 },
    "harness|hendrycksTest-philosophy|5": { "acc": 0.7459807073954984, "acc_stderr": 0.024723861504771707, "acc_norm": 0.7459807073954984, "acc_norm_stderr": 0.024723861504771707 },
    "harness|hendrycksTest-prehistory|5": { "acc": 0.7777777777777778, "acc_stderr": 0.023132376234543343, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.023132376234543343 },
    "harness|hendrycksTest-professional_accounting|5": { "acc": 0.5602836879432624, "acc_stderr": 0.02960991207559411, "acc_norm": 0.5602836879432624, "acc_norm_stderr": 0.02960991207559411 },
    "harness|hendrycksTest-professional_law|5": { "acc": 0.49282920469361147, "acc_stderr": 0.012768922739553308, "acc_norm": 0.49282920469361147, "acc_norm_stderr": 0.012768922739553308 },
    "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6948529411764706, "acc_stderr": 0.027971541370170595, "acc_norm": 0.6948529411764706, "acc_norm_stderr": 0.027971541370170595 },
    "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6830065359477124, "acc_stderr": 0.018824219512706207, "acc_norm": 0.6830065359477124, "acc_norm_stderr": 0.018824219512706207 },
    "harness|hendrycksTest-public_relations|5": { "acc": 0.6818181818181818, "acc_stderr": 0.044612721759105085, "acc_norm": 0.6818181818181818, "acc_norm_stderr": 0.044612721759105085 },
    "harness|hendrycksTest-security_studies|5": { "acc": 0.7387755102040816, "acc_stderr": 0.028123429335142783, "acc_norm": 0.7387755102040816, "acc_norm_stderr": 0.028123429335142783 },
    "harness|hendrycksTest-sociology|5": { "acc": 0.8308457711442786, "acc_stderr": 0.02650859065623327, "acc_norm": 0.8308457711442786, "acc_norm_stderr": 0.02650859065623327 },
    "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.86, "acc_stderr": 0.03487350880197768, "acc_norm": 0.86, "acc_norm_stderr": 0.03487350880197768 },
    "harness|hendrycksTest-virology|5": { "acc": 0.5421686746987951, "acc_stderr": 0.03878626771002361, "acc_norm": 0.5421686746987951, "acc_norm_stderr": 0.03878626771002361 },
    "harness|hendrycksTest-world_religions|5": { "acc": 0.8070175438596491, "acc_stderr": 0.030267457554898458, "acc_norm": 0.8070175438596491, "acc_norm_stderr": 0.030267457554898458 },
    "harness|truthfulqa:mc|0": { "mc1": 0.6352509179926561, "mc1_stderr": 0.016850961061720137, "mc2": 0.7671915273948061, "mc2_stderr": 0.01385022212840208 },
    "harness|winogrande|5": { "acc": 0.8721389108129439, "acc_stderr": 0.009385235583937257 },
    "harness|gsm8k|5": { "acc": 0.6269901440485216, "acc_stderr": 0.013320876609777214 }
}
```
配置详情
- harness_arc_challenge_25
  - 分割:2024_03_11T19_32_53.452866
    - 路径:`**/details_harness|arc:challenge|25_2024-03-11T19-32-53.452866.parquet`
  - 分割:latest
    - 路径:`**/details_harness|arc:challenge|25_2024-03-11T19-32-53.452866.parquet`
- harness_gsm8k_5
  - 分割:2024_03_11T19_32_53.452866
    - 路径:`**/details_harness|gsm8k|5_2024-03-11T19-32-53.452866.parquet`
  - 分割:latest
    - 路径:`**/details_harness|gsm8k|5_2024-03-11T19-32-53.452866.parquet`
- harness_hellaswag_10
  - 分割:2024_03_11T19_32_53.452866
    - 路径:`**/details_harness|hellaswag|10_2024-03-11T19-32-53.452866.parquet`
  - 分割:latest
    - 路径:`**/details_harness|hellaswag|10_2024-03-11T19-32-53.452866.parquet`
- harness_hendrycksTest_5
  - 分割:2024_03_11T19_32_53.452866
    - 路径:
      - `**/details_harness|hendrycksTest-abstract_algebra|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-anatomy|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-astronomy|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-business_ethics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-clinical_knowledge|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-college_biology|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-college_chemistry|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-college_computer_science|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-college_mathematics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-college_medicine|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-college_physics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-computer_security|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-conceptual_physics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-econometrics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-electrical_engineering|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-elementary_mathematics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-formal_logic|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-global_facts|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_biology|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_chemistry|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_computer_science|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_european_history|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_geography|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_government_and_politics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_macroeconomics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_mathematics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_microeconomics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_physics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_psychology|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_statistics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_us_history|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-high_school_world_history|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-human_aging|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-human_sexuality|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-international_law|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-jurisprudence|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-logical_fallacies|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-machine_learning|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-management|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-marketing|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-medical_genetics|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-miscellaneous|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-moral_disputes|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-moral_scenarios|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-nutrition|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-philosophy|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-prehistory|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-professional_accounting|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-professional_law|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-professional_medicine|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-professional_psychology|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-public_relations|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-security_studies|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-sociology|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-us_foreign_policy|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-virology|5_2024-03-11T19-32-53.452866.parquet`
      - `**/details_harness|hendrycksTest-world_religions|5_2024-03-11T19-32-53.452866.parquet`



