open-llm-leaderboard-old/details_robinsmits__Qwen1.5-7B-Dutch-Chat-Dpo
数据集概述
数据集简介
该数据集是在对模型 robinsmits/Qwen1.5-7B-Dutch-Chat-Dpo 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集结构
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建;每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_robinsmits__Qwen1.5-7B-Dutch-Chat-Dpo",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-03-29T19:00:48.502941 运行的最新结果:
python { "all": { "acc": 0.5997482675075448, "acc_stderr": 0.03320488412208406, "acc_norm": 0.6076095915276317, "acc_norm_stderr": 0.033890186938401776, "mc1": 0.27906976744186046, "mc1_stderr": 0.01570210709062791, "mc2": 0.42373630767195636, "mc2_stderr": 0.014665475633177178 }, "harness|arc:challenge|25": { "acc": 0.4786689419795222, "acc_stderr": 0.014598087973127106, "acc_norm": 0.507679180887372, "acc_norm_stderr": 0.01460966744089257 }, "harness|hellaswag|10": { "acc": 0.5514837681736706, "acc_stderr": 0.004963259311700567, "acc_norm": 0.7423819956184027, "acc_norm_stderr": 0.004364287353415457 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5111111111111111, "acc_stderr": 0.04318275491977978, "acc_norm": 0.5111111111111111, "acc_norm_stderr": 0.04318275491977978 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6644736842105263, "acc_stderr": 0.03842498559395269, "acc_norm": 0.6644736842105263, "acc_norm_stderr": 0.03842498559395269 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.67, "acc_stderr": 0.04725815626252607, "acc_norm": 0.67, "acc_norm_stderr": 0.04725815626252607 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.660377358490566, "acc_stderr": 0.029146904747798328, "acc_norm": 0.660377358490566, "acc_norm_stderr": 0.029146904747798328 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7013888888888888, "acc_stderr": 0.03827052357950756, "acc_norm": 0.7013888888888888, "acc_norm_stderr": 0.03827052357950756 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.48, "acc_stderr": 0.05021167315686779, "acc_norm": 0.48, "acc_norm_stderr": 0.05021167315686779 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5953757225433526, "acc_stderr": 0.03742461193887248, "acc_norm": 0.5953757225433526, "acc_norm_stderr": 0.03742461193887248 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.37254901960784315, "acc_stderr": 0.04810840148082635, "acc_norm": 0.37254901960784315, "acc_norm_stderr": 0.04810840148082635 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.72, "acc_stderr": 0.04512608598542128, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5276595744680851, "acc_stderr": 0.03263597118409769, "acc_norm": 0.5276595744680851, "acc_norm_stderr": 0.03263597118409769 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4473684210526316, "acc_stderr": 0.04677473004491199, "acc_norm": 0.4473684210526316, "acc_norm_stderr": 0.04677473004491199 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6068965517241379, "acc_stderr": 0.040703290137070705, "acc_norm": 0.6068965517241379, "acc_norm_stderr": 0.040703290137070705 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.47619047619047616, "acc_stderr": 0.025722097064388535, "acc_norm": 0.47619047619047616, "acc_norm_stderr": 0.025722097064388535 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4444444444444444, "acc_stderr": 0.04444444444444449, "acc_norm": 0.4444444444444444, "acc_norm_stderr": 0.04444444444444449 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7032258064516129, "acc_stderr": 0.0259885007924119, "acc_norm": 0.7032258064516129, "acc_norm_stderr": 0.0259885007924119 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.541871921182266, "acc_stderr": 0.03505630140785742, 
"acc_norm": 0.541871921182266, "acc_norm_stderr": 0.03505630140785742 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.67, "acc_stderr": 0.04725815626252609, "acc_norm": 0.67, "acc_norm_stderr": 0.04725815626252609 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7575757575757576, "acc_stderr": 0.03346409881055953, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.03346409881055953 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7777777777777778, "acc_stderr": 0.02962022787479048, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.02962022787479048 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8134715025906736, "acc_stderr": 0.02811209121011745, "acc_norm": 0.8134715025906736, "acc_norm_stderr": 0.02811209121011745 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5820512820512821, "acc_stderr": 0.02500732988246122, "acc_norm": 0.5820512820512821, "acc_norm_stderr": 0.02500732988246122 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.34444444444444444, "acc_stderr": 0.02897264888484427, "acc_norm": 0.34444444444444444, "acc_norm_stderr": 0.02897264888484427 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.5966386554621849, "acc_stderr": 0.03186608121408832, "acc_norm": 0.5966386554621849, "acc_norm_stderr": 0.03186608121408832 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.37748344370860926, "acc_stderr": 0.0395802723112157, "acc_norm": 0.37748344370860926, "acc_norm_stderr": 0.0395802723112157 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.7926605504587156, "acc_stderr": 0.01738141556360868, "acc_norm": 0.7926605504587156, "acc_norm_stderr": 0.01738141556360868 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5046296296296297, "acc_stderr": 0.03409825519163572, "acc_norm": 0.5046296296296297, "acc_norm_stderr": 0.03409825519163572 }, 
"harness|hendrycksTest-high_school_us_history|5": { "acc": 0.75, "acc_stderr": 0.03039153369274154, "acc_norm": 0.75, "acc_norm_stderr": 0.03039153369274154 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.7679324894514767, "acc_stderr": 0.02747974455080851, "acc_norm": 0.7679324894514767, "acc_norm_stderr": 0.02747974455080851 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6322869955156951, "acc_stderr": 0.03236198350928275, "acc_norm": 0.6322869955156951, "acc_norm_stderr": 0.03236198350928275 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7175572519083969, "acc_stderr": 0.03948406125768361, "acc_norm": 0.7175572519083969, "acc_norm_stderr": 0.03948406125768361 }, "harness|hendrycksTest-international_law|5": { "acc": 0.7851239669421488, "acc_stderr": 0.03749492448709698, "acc_norm": 0.7851239669421488, "acc_norm_stderr": 0.03749492448709698 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7777777777777778, "acc_stderr": 0.0401910747255735, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.0401910747255735 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.6503067484662577, "acc_stderr": 0.03746668325470022, "acc_norm": 0.6503067484662577, "acc_norm_stderr": 0.03746668325470022 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.38392857142857145, "acc_stderr": 0.04616143075028547, "acc_norm": 0.38392857142857145, "acc_norm_stderr": 0.04616143075028547 }, "harness|hendrycksTest-management|5": { "acc": 0.8058252427184466, "acc_stderr": 0.03916667762822584, "acc_norm": 0.8058252427184466, "acc_norm_stderr": 0.03916667762822584 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8589743589743589, "acc_stderr": 0.022801382534597542, "acc_norm": 0.8589743589743589, "acc_norm_stderr": 0.022801382534597542 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.68, "acc_stderr": 0.046882617226215034, "acc_norm": 0.68, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 
0.7535121328224776, "acc_stderr": 0.015411308769686933, "acc_norm": 0.7535121328224776, "acc_norm_stderr": 0.015411308769686933 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.653179190751445, "acc_stderr": 0.025624723994030457, "acc_norm": 0.653179190751445, "acc_norm_stderr": 0.025624723994030457 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.33743016759776534, "acc_stderr": 0.015813901283913048, "acc_norm": 0.33743016759776534, "acc_norm_stderr": 0.015813901283913048 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7091503267973857, "acc_stderr": 0.02600480036395213, "acc_norm": 0.7091503267973857, "acc_norm_stderr": 0.02600480036395213 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.6688102893890675, "acc_stderr": 0.02673062072800491, "acc_norm": 0.6688102893890675, "acc_norm_stderr": 0.02673062072800491 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.6203703703703703, "acc_stderr": 0.027002521034516478, "acc_norm": 0.6203703703703703, "acc_norm_stderr": 0.027002521034516478 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.38652482269503546, "acc_stderr": 0.029049190342543454, "acc_norm": 0.38652482269503546, "acc_norm_stderr": 0.029049190342543454 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.43285528031290743, "acc_stderr": 0.012654565234622868, "acc_norm": 0.43285528031290743, "acc_norm_stderr": 0.012654565234622868 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.5367647058823529, "acc_stderr": 0.030290619180485683, "acc_norm": 0.5367647058823529, "acc_norm_stderr": 0.030290619180485683 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.5833333333333334, "acc_stderr": 0.01994491413687358, "acc_norm": 0.5833333333333334, "acc_norm_stderr": 0.01994491413687358 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6272727272727273, "acc_stderr": 0.04631381319425465, "acc_norm": 0.6272727272727273, "acc_norm_stderr": 0.04631381319425465 }, 
"harness|hendrycksTest-security_studies|5": { "acc": 0.6857142857142857, "acc_stderr": 0.02971932942241748, "acc_norm": 0.6857142857142857, "acc_norm_stderr": 0.02971932942241748 }, "harness|hendrycksTest-sociology|5": { "acc": 0.7860696517412935, "acc_stderr": 0.028996909693328913, "acc_norm": 0.7860696517412935, "acc_norm_stderr": 0.028996909693328913 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.84, "acc_stderr": 0.03684529491774708, "acc_norm": 0.84, "acc_norm_stderr": 0.03684529491774708 }, "harness|hendrycksTest-virology|5": { "acc": 0.5, "acc_stderr": 0.03892494720807614, "acc_norm": 0.5, "acc_norm_stderr": 0.03892494720807614 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.7485380116959064, "acc_stderr": 0.033275044238468436, "acc_norm": 0.7485380116959064, "acc_norm_stderr": 0.033275044238468436 }, "harness|truthfulqa:mc|0": { "mc1": 0.27906976744186046, "mc1_stderr": 0.01570210709062791, "mc2": 0.42373630767195636, "mc2_stderr": 0.014665475633177178 }, "harness|winogrande|5": { "acc": 0.681136543014996, "acc_stderr": 0.01309792842008877 }, "harness|gsm8k|5": { "acc": 0.2744503411675512, "acc_stderr": 0.012291581170814893 } }



