open-llm-leaderboard-old/details_kuotient__Seagull-Llama-3-8B-orpo-v0.1
数据集概述
该数据集是在 Open LLM Leaderboard 上对模型 kuotient/Seagull-Llama-3-8B-orpo-v0.1 进行评估运行的过程中自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。
数据集结构
- 配置数量:63个配置
- 数据来源:1次运行结果
- 数据分割:每个配置包含特定运行的分割,分割名称使用运行的时间戳。"train" 分割始终指向最新的结果。
- 额外配置:"results" 配置存储所有运行的聚合结果,用于计算和显示在 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_kuotient__Seagull-Llama-3-8B-orpo-v0.1", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-04-20T15:46:04.199778 运行的最新结果:
python { "all": { "acc": 0.6504130201404981, "acc_stderr": 0.03221414312609127, "acc_norm": 0.6554584609277425, "acc_norm_stderr": 0.03285716056701538, "mc1": 0.37454100367197063, "mc1_stderr": 0.016943535128405338, "mc2": 0.5488794060695712, "mc2_stderr": 0.014798227306290622 }, "harness|arc:challenge|25": { "acc": 0.5435153583617748, "acc_stderr": 0.014555949760496439, "acc_norm": 0.5861774744027304, "acc_norm_stderr": 0.014392730009221009 }, "harness|hellaswag|10": { "acc": 0.6148177653853814, "acc_stderr": 0.0048564379557198565, "acc_norm": 0.8174666401115316, "acc_norm_stderr": 0.0038549403270910316 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6666666666666666, "acc_stderr": 0.04072314811876837, "acc_norm": 0.6666666666666666, "acc_norm_stderr": 0.04072314811876837 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6644736842105263, "acc_stderr": 0.038424985593952694, "acc_norm": 0.6644736842105263, "acc_norm_stderr": 0.038424985593952694 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.61, "acc_stderr": 0.04902071300001975, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7358490566037735, "acc_stderr": 0.027134291628741695, "acc_norm": 0.7358490566037735, "acc_norm_stderr": 0.027134291628741695 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7361111111111112, "acc_stderr": 0.03685651095897532, "acc_norm": 0.7361111111111112, "acc_norm_stderr": 0.03685651095897532 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6416184971098265, "acc_stderr": 0.036563436533531585, "acc_norm": 0.6416184971098265, "acc_norm_stderr": 0.036563436533531585 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.47058823529411764, "acc_stderr": 0.04966570903978529, "acc_norm": 0.47058823529411764, "acc_norm_stderr": 0.04966570903978529 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.8, "acc_stderr": 0.04020151261036846, "acc_norm": 0.8, "acc_norm_stderr": 0.04020151261036846 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.574468085106383, "acc_stderr": 0.03232146916224468, "acc_norm": 0.574468085106383, "acc_norm_stderr": 0.03232146916224468 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4473684210526316, "acc_stderr": 0.04677473004491199, "acc_norm": 0.4473684210526316, "acc_norm_stderr": 0.04677473004491199 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5862068965517241, "acc_stderr": 0.04104269211806232, "acc_norm": 0.5862068965517241, "acc_norm_stderr": 0.04104269211806232 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.42857142857142855, "acc_stderr": 0.025487187147859372, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.025487187147859372 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4603174603174603, "acc_stderr": 0.04458029125470973, "acc_norm": 0.4603174603174603, "acc_norm_stderr": 0.04458029125470973 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.38, "acc_stderr": 0.04878317312145632, "acc_norm": 0.38, "acc_norm_stderr": 0.04878317312145632 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7709677419354839, "acc_stderr": 0.02390491431178265, "acc_norm": 0.7709677419354839, "acc_norm_stderr": 0.02390491431178265 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.5221674876847291, "acc_stderr": 0.03514528562175008, "acc_norm": 0.5221674876847291, "acc_norm_stderr": 0.03514528562175008 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7575757575757576, "acc_stderr": 0.03346409881055953, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.03346409881055953 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8232323232323232, "acc_stderr": 0.027178752639044915, "acc_norm": 0.8232323232323232, "acc_norm_stderr": 0.027178752639044915 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8860103626943006, "acc_stderr": 0.022935144053919436, "acc_norm": 0.8860103626943006, "acc_norm_stderr": 0.022935144053919436 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6384615384615384, "acc_stderr": 0.024359581465396993, "acc_norm": 0.6384615384615384, "acc_norm_stderr": 0.024359581465396993 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3962962962962963, "acc_stderr": 0.029822619458533997, "acc_norm": 0.3962962962962963, "acc_norm_stderr": 0.029822619458533997 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6890756302521008, "acc_stderr": 0.030066761582977917, "acc_norm": 0.6890756302521008, "acc_norm_stderr": 0.030066761582977917 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.423841059602649, "acc_stderr": 0.04034846678603397, "acc_norm": 0.423841059602649, "acc_norm_stderr": 0.04034846678603397 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8293577981651377, "acc_stderr": 0.016129271025099853, "acc_norm": 0.8293577981651377, "acc_norm_stderr": 0.016129271025099853 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5787037037037037, "acc_stderr": 0.03367462138896078, "acc_norm": 0.5787037037037037, "acc_norm_stderr": 
0.03367462138896078 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8137254901960784, "acc_stderr": 0.027325470966716323, "acc_norm": 0.8137254901960784, "acc_norm_stderr": 0.027325470966716323 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.8143459915611815, "acc_stderr": 0.025310495376944863, "acc_norm": 0.8143459915611815, "acc_norm_stderr": 0.025310495376944863 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.726457399103139, "acc_stderr": 0.029918586707798827, "acc_norm": 0.726457399103139, "acc_norm_stderr": 0.029918586707798827 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7786259541984732, "acc_stderr": 0.03641297081313732, "acc_norm": 0.7786259541984732, "acc_norm_stderr": 0.03641297081313732 }, "harness|hendrycksTest-international_law|5": { "acc": 0.7603305785123967, "acc_stderr": 0.03896878985070416, "acc_norm": 0.7603305785123967, "acc_norm_stderr": 0.03896878985070416 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.75, "acc_stderr": 0.04186091791394607, "acc_norm": 0.75, "acc_norm_stderr": 0.04186091791394607 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7239263803680982, "acc_stderr": 0.035123852837050475, "acc_norm": 0.7239263803680982, "acc_norm_stderr": 0.035123852837050475 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.5357142857142857, "acc_stderr": 0.04733667890053756, "acc_norm": 0.5357142857142857, "acc_norm_stderr": 0.04733667890053756 }, "harness|hendrycksTest-management|5": { "acc": 0.8543689320388349, "acc_stderr": 0.034926064766237906, "acc_norm": 0.8543689320388349, "acc_norm_stderr": 0.034926064766237906 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8547008547008547, "acc_stderr": 0.023086635086841407, "acc_norm": 0.8547008547008547, "acc_norm_stderr": 0.023086635086841407 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.8, "acc_stderr": 0.040201512610368445, "acc_norm": 0.8, "acc_norm_stderr": 0.040201512610368445 }, 
"harness|hendrycksTest-miscellaneous|5": { "acc": 0.8199233716475096, "acc_stderr": 0.013740797258579825, "acc_norm": 0.8199233716475096, "acc_norm_stderr": 0.013740797258579825 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.6994219653179191, "acc_stderr": 0.024685316867257796, "acc_norm": 0.6994219653179191, "acc_norm_stderr": 0.024685316867257796 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.37206703910614525, "acc_stderr": 0.016165847583563295, "acc_norm": 0.37206703910614525, "acc_norm_stderr": 0.016165847583563295 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7418300653594772, "acc_stderr": 0.02505850331695814, "acc_norm": 0.7418300653594772, "acc_norm_stderr": 0.02505850331695814 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.7170418006430869, "acc_stderr": 0.02558306248998481, "acc_norm": 0.7170418006430869, "acc_norm_stderr": 0.02558306248998481 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7129629629629629, "acc_stderr": 0.02517104191530968, "acc_norm": 0.7129629629629629, "acc_norm_stderr": 0.02517104191530968 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.49645390070921985, "acc_stderr": 0.02982674915328092, "acc_norm": 0.49645390070921985, "acc_norm_stderr": 0.02982674915328092 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.45632333767926986, "acc_stderr": 0.012721420501462544, "acc_norm": 0.45632333767926986, "acc_norm_stderr": 0.012721420501462544 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.7205882352941176, "acc_stderr": 0.02725720260611494, "acc_norm": 0.7205882352941176, "acc_norm_stderr": 0.02725720260611494 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6797385620915033, "acc_stderr": 0.018875682938069443, "acc_norm": 0.6797385620915033, "acc_norm_stderr": 0.018875682938069443 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.7090909090909091, "acc_stderr": 0.04350271442923243, "acc_norm": 0.7090909090909091, "acc_norm_stderr": 
0.04350271442923243 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7306122448979592, "acc_stderr": 0.02840125202902294, "acc_norm": 0.7306122448979592, "acc_norm_stderr": 0.02840125202902294 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8656716417910447, "acc_stderr": 0.024112678240900798, "acc_norm": 0.8656716417910447, "acc_norm_stderr": 0.024112678240900798 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.83, "acc_stderr": 0.0377525168068637, "acc_norm": 0.83, "acc_norm_stderr": 0.0377525168068637 }, "harness|hendrycksTest-virology|5": { "acc": 0.536144578313253, "acc_stderr": 0.038823108508905954, "acc_norm": 0.536144578313253, "acc_norm_stderr": 0.038823108508905954 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8304093567251462, "acc_stderr": 0.028782108105401705, "acc_norm": 0.8304093567251462, "acc_norm_stderr": 0.028782108105401705 }, "harness|truthfulqa:mc|0": { "mc1": 0.37454100367197063, "mc1_stderr": 0.016943535128405338, "mc2": 0.5488794060695712, "mc2_stderr": 0.014798227306290622 }, "harness|winogrande|5": { "acc": 0.7829518547750592, "acc_stderr": 0.011585871710209411 }, "harness|gsm8k|5": { "acc": 0.4655041698256255, "acc_stderr": 0.013739668147545916 } }



