open-llm-leaderboard-old/details_NeverSleep__MiquMaid-v2-70B
数据集概述
该数据集是在对模型NeverSleep/MiquMaid-v2-70B进行评估运行期间自动创建的,用于Open LLM Leaderboard。
数据集组成
- 数据集包含63个配置,每个配置对应一个评估任务。
- 数据集由1次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train"分割始终指向最新的结果。
- 额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_NeverSleep__MiquMaid-v2-70B", "harness_winogrande_5", split="train")
```
最新结果
以下是运行2024-02-10T00:32:33.035369的最新结果:
python { "all": { "acc": 0.7408024702977796, "acc_stderr": 0.02851656534172317, "acc_norm": 0.753086165386141, "acc_norm_stderr": 0.02910913441413026, "mc1": 0.40758873929008566, "mc1_stderr": 0.017201949234553104, "mc2": 0.5762261950802, "mc2_stderr": 0.014578620162618537 }, "harness|arc:challenge|25": { "acc": 0.6459044368600683, "acc_stderr": 0.013975454122756565, "acc_norm": 0.7047781569965871, "acc_norm_stderr": 0.013329750293382316 }, "harness|hellaswag|10": { "acc": 0.6868153754232225, "acc_stderr": 0.0046284090842187596, "acc_norm": 0.8749253136825333, "acc_norm_stderr": 0.003301275117987939 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6962962962962963, "acc_stderr": 0.03972552884785136, "acc_norm": 0.6962962962962963, "acc_norm_stderr": 0.03972552884785136 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.8355263157894737, "acc_stderr": 0.03016753346863271, "acc_norm": 0.8355263157894737, "acc_norm_stderr": 0.03016753346863271 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.78, "acc_stderr": 0.04163331998932261, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932261 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.8150943396226416, "acc_stderr": 0.023893351834464317, "acc_norm": 0.8150943396226416, "acc_norm_stderr": 0.023893351834464317 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.9027777777777778, "acc_stderr": 0.02477451625044016, "acc_norm": 0.9027777777777778, "acc_norm_stderr": 0.02477451625044016 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.54, "acc_stderr": 0.05009082659620332, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.63, "acc_stderr": 0.04852365870939099, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.46, "acc_stderr": 0.05009082659620332, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.7572254335260116, "acc_stderr": 0.0326926380614177, "acc_norm": 0.7572254335260116, "acc_norm_stderr": 0.0326926380614177 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4803921568627451, "acc_stderr": 0.04971358884367406, "acc_norm": 0.4803921568627451, "acc_norm_stderr": 0.04971358884367406 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.8, "acc_stderr": 0.04020151261036846, "acc_norm": 0.8, "acc_norm_stderr": 0.04020151261036846 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.7489361702127659, "acc_stderr": 0.02834696377716245, "acc_norm": 0.7489361702127659, "acc_norm_stderr": 0.02834696377716245 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5964912280701754, "acc_stderr": 0.04615186962583707, "acc_norm": 0.5964912280701754, "acc_norm_stderr": 0.04615186962583707 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.7103448275862069, "acc_stderr": 0.03780019230438015, "acc_norm": 0.7103448275862069, "acc_norm_stderr": 0.03780019230438015 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.5264550264550265, "acc_stderr": 0.02571523981134675, "acc_norm": 0.5264550264550265, "acc_norm_stderr": 0.02571523981134675 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5317460317460317, "acc_stderr": 0.04463112720677173, "acc_norm": 0.5317460317460317, "acc_norm_stderr": 0.04463112720677173 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8741935483870967, "acc_stderr": 0.018865834288030008, "acc_norm": 0.8741935483870967, "acc_norm_stderr": 0.018865834288030008 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.6157635467980296, "acc_stderr": 0.034223985656575515, 
"acc_norm": 0.6157635467980296, "acc_norm_stderr": 0.034223985656575515 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.86, "acc_stderr": 0.03487350880197769, "acc_norm": 0.86, "acc_norm_stderr": 0.03487350880197769 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8363636363636363, "acc_stderr": 0.02888787239548795, "acc_norm": 0.8363636363636363, "acc_norm_stderr": 0.02888787239548795 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.9242424242424242, "acc_stderr": 0.0188526702349931, "acc_norm": 0.9242424242424242, "acc_norm_stderr": 0.0188526702349931 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9430051813471503, "acc_stderr": 0.01673108529360756, "acc_norm": 0.9430051813471503, "acc_norm_stderr": 0.01673108529360756 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.7923076923076923, "acc_stderr": 0.02056753956724681, "acc_norm": 0.7923076923076923, "acc_norm_stderr": 0.02056753956724681 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.4222222222222222, "acc_stderr": 0.030114442019668095, "acc_norm": 0.4222222222222222, "acc_norm_stderr": 0.030114442019668095 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.865546218487395, "acc_stderr": 0.022159373072744442, "acc_norm": 0.865546218487395, "acc_norm_stderr": 0.022159373072744442 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.5496688741721855, "acc_stderr": 0.04062290018683775, "acc_norm": 0.5496688741721855, "acc_norm_stderr": 0.04062290018683775 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.9137614678899083, "acc_stderr": 0.012035597300116245, "acc_norm": 0.9137614678899083, "acc_norm_stderr": 0.012035597300116245 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.6990740740740741, "acc_stderr": 0.03128039084329881, "acc_norm": 0.6990740740740741, "acc_norm_stderr": 0.03128039084329881 }, 
"harness|hendrycksTest-high_school_us_history|5": { "acc": 0.9264705882352942, "acc_stderr": 0.018318855850089678, "acc_norm": 0.9264705882352942, "acc_norm_stderr": 0.018318855850089678 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.9156118143459916, "acc_stderr": 0.018094247116473335, "acc_norm": 0.9156118143459916, "acc_norm_stderr": 0.018094247116473335 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.8295964125560538, "acc_stderr": 0.02523459344713617, "acc_norm": 0.8295964125560538, "acc_norm_stderr": 0.02523459344713617 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.8702290076335878, "acc_stderr": 0.029473649496907065, "acc_norm": 0.8702290076335878, "acc_norm_stderr": 0.029473649496907065 }, "harness|hendrycksTest-international_law|5": { "acc": 0.9008264462809917, "acc_stderr": 0.027285246312758957, "acc_norm": 0.9008264462809917, "acc_norm_stderr": 0.027285246312758957 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.8888888888888888, "acc_stderr": 0.03038159675665167, "acc_norm": 0.8888888888888888, "acc_norm_stderr": 0.03038159675665167 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.8220858895705522, "acc_stderr": 0.03004735765580662, "acc_norm": 0.8220858895705522, "acc_norm_stderr": 0.03004735765580662 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.6607142857142857, "acc_stderr": 0.04493949068613539, "acc_norm": 0.6607142857142857, "acc_norm_stderr": 0.04493949068613539 }, "harness|hendrycksTest-management|5": { "acc": 0.8737864077669902, "acc_stderr": 0.03288180278808629, "acc_norm": 0.8737864077669902, "acc_norm_stderr": 0.03288180278808629 }, "harness|hendrycksTest-marketing|5": { "acc": 0.9188034188034188, "acc_stderr": 0.01789378490401853, "acc_norm": 0.9188034188034188, "acc_norm_stderr": 0.01789378490401853 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.8, "acc_stderr": 0.040201512610368445, "acc_norm": 0.8, "acc_norm_stderr": 0.040201512610368445 }, 
"harness|hendrycksTest-miscellaneous|5": { "acc": 0.8978288633461047, "acc_stderr": 0.01083072471313418, "acc_norm": 0.8978288633461047, "acc_norm_stderr": 0.01083072471313418 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.8265895953757225, "acc_stderr": 0.020383229551135005, "acc_norm": 0.8265895953757225, "acc_norm_stderr": 0.020383229551135005 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.6011173184357542, "acc_stderr": 0.016376966142610073, "acc_norm": 0.6011173184357542, "acc_norm_stderr": 0.016376966142610073 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.8235294117647058, "acc_stderr": 0.021828596053108416, "acc_norm": 0.8235294117647058, "acc_norm_stderr": 0.021828596053108416 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.8263665594855305, "acc_stderr": 0.0215140515859704, "acc_norm": 0.8263665594855305, "acc_norm_stderr": 0.0215140515859704 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.8518518518518519, "acc_stderr": 0.019766459563597252, "acc_norm": 0.8518518518518519, "acc_norm_stderr": 0.019766459563597252 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.599290780141844, "acc_stderr": 0.029233465745573093, "acc_norm": 0.599290780141844, "acc_norm_stderr": 0.029233465745573093 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.5840938722294654, "acc_stderr": 0.01258832385031359, "acc_norm": 0.5840938722294654, "acc_norm_stderr": 0.01258832385031359 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.8014705882352942, "acc_stderr": 0.024231013370541087, "acc_norm": 0.8014705882352942, "acc_norm_stderr": 0.024231013370541087 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.815359477124183, "acc_stderr": 0.015697029240757783, "acc_norm": 0.815359477124183, "acc_norm_stderr": 0.015697029240757783 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.7090909090909091, "acc_stderr": 0.04350271442923243, "acc_norm": 0.7090909090909091, "acc_norm_stderr": 
0.04350271442923243 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.8408163265306122, "acc_stderr": 0.023420972069166344, "acc_norm": 0.8408163265306122, "acc_norm_stderr": 0.023420972069166344 }, "harness|hendrycksTest-sociology|5": { "acc": 0.9253731343283582, "acc_stderr": 0.01858193969849063, "acc_norm": 0.9253731343283582, "acc_norm_stderr": 0.01858193969849063 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.94, "acc_stderr": 0.02386832565759418, "acc_norm": 0.94, "acc_norm_stderr": 0.02386832565759418 }, "harness|hendrycksTest-virology|5": { "acc": 0.5662650602409639, "acc_stderr": 0.03858158940685515, "acc_norm": 0.5662650602409639, "acc_norm_stderr": 0.03858158940685515 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8888888888888888, "acc_stderr": 0.024103384202072864, "acc_norm": 0.8888888888888888, "acc_norm_stderr": 0.024103384202072864 }, "harness|truthfulqa:mc|0": { "mc1": 0.40758873929008566, "mc1_stderr": 0.017201949234553104, "mc2": 0.5762261950802, "mc2_stderr": 0.014578620162618537 }, "harness|winogrande|5": { "acc": 0.8476716653512234, "acc_stderr": 0.010099208246065583 }, "harness|gsm8k|5": { "acc": 0.1561789234268385, "acc_stderr": 0.009999509369757457 } }



