open-llm-leaderboard-old/details_eren23__ogno-monarch-jaskier-merge-7b-OH-PREF-DPO-v3
收藏数据集概述
该数据集是在模型 eren23/ogno-monarch-jaskier-merge-7b-OH-PREF-DPO-v3 的评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集从 1 次运行中创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_eren23__ogno-monarch-jaskier-merge-7b-OH-PREF-DPO-v3", "harness_winogrande_5", split="train")
最新结果
以下是 2024-03-02T17:08:29.374404 运行的最新结果:
python { "all": { "acc": 0.6538801740167637, "acc_stderr": 0.03205838652479307, "acc_norm": 0.6533974228406447, "acc_norm_stderr": 0.03272813802285965, "mc1": 0.6291309669522643, "mc1_stderr": 0.016909693580248835, "mc2": 0.7748431207842124, "mc2_stderr": 0.013793813723651245 }, "harness|arc:challenge|25": { "acc": 0.7022184300341296, "acc_stderr": 0.013363080107244482, "acc_norm": 0.7303754266211604, "acc_norm_stderr": 0.012968040686869147 }, "harness|hellaswag|10": { "acc": 0.7155945030870344, "acc_stderr": 0.0045020882874701375, "acc_norm": 0.8910575582553276, "acc_norm_stderr": 0.0031093023001762094 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6444444444444445, "acc_stderr": 0.04135176749720385, "acc_norm": 0.6444444444444445, "acc_norm_stderr": 0.04135176749720385 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6973684210526315, "acc_stderr": 0.03738520676119669, "acc_norm": 0.6973684210526315, "acc_norm_stderr": 0.03738520676119669 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.63, "acc_stderr": 0.04852365870939099, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6981132075471698, "acc_stderr": 0.02825420034443866, "acc_norm": 0.6981132075471698, "acc_norm_stderr": 0.02825420034443866 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7777777777777778, "acc_stderr": 0.03476590104304134, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.03476590104304134 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.47, "acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.59, "acc_stderr": 0.04943110704237102, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.653179190751445, "acc_stderr": 0.036291466701596636, "acc_norm": 0.653179190751445, "acc_norm_stderr": 0.036291466701596636 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.38235294117647056, "acc_stderr": 0.04835503696107224, "acc_norm": 0.38235294117647056, "acc_norm_stderr": 0.04835503696107224 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.71, "acc_stderr": 0.04560480215720684, "acc_norm": 0.71, "acc_norm_stderr": 0.04560480215720684 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5617021276595745, "acc_stderr": 0.03243618636108102, "acc_norm": 0.5617021276595745, "acc_norm_stderr": 0.03243618636108102 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.47368421052631576, "acc_stderr": 0.046970851366478626, "acc_norm": 0.47368421052631576, "acc_norm_stderr": 0.046970851366478626 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5655172413793104, "acc_stderr": 0.04130740879555498, "acc_norm": 0.5655172413793104, "acc_norm_stderr": 0.04130740879555498 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41534391534391535, "acc_stderr": 0.025379524910778394, "acc_norm": 0.41534391534391535, "acc_norm_stderr": 0.025379524910778394 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4603174603174603, "acc_stderr": 0.04458029125470973, "acc_norm": 0.4603174603174603, "acc_norm_stderr": 0.04458029125470973 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7870967741935484, "acc_stderr": 0.02328766512726855, "acc_norm": 0.7870967741935484, "acc_norm_stderr": 0.02328766512726855 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5320197044334976, "acc_stderr": 0.03510766597959215, "acc_norm": 0.5320197044334976, "acc_norm_stderr": 0.03510766597959215 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7696969696969697, "acc_stderr": 0.032876667586034906, "acc_norm": 0.7696969696969697, "acc_norm_stderr": 0.032876667586034906 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8080808080808081, "acc_stderr": 0.028057791672989017, "acc_norm": 0.8080808080808081, "acc_norm_stderr": 0.028057791672989017 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8911917098445595, "acc_stderr": 0.022473253332768763, "acc_norm": 0.8911917098445595, "acc_norm_stderr": 0.022473253332768763 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6666666666666666, "acc_stderr": 0.023901157979402538, "acc_norm": 0.6666666666666666, "acc_norm_stderr": 0.023901157979402538 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3333333333333333, "acc_stderr": 0.028742040903948485, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.028742040903948485 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6890756302521008, "acc_stderr": 0.03006676158297793, "acc_norm": 0.6890756302521008, "acc_norm_stderr": 0.03006676158297793 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.39072847682119205, "acc_stderr": 0.039837983066598075, "acc_norm": 0.39072847682119205, "acc_norm_stderr": 0.039837983066598075 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8385321100917431, "acc_stderr": 0.01577623925616323, "acc_norm": 0.8385321100917431, "acc_norm_stderr": 0.01577623925616323 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5185185185185185, "acc_stderr": 0.03407632093854051, "acc_norm": 0.5185185185185185, "acc_norm_stderr": 0.03407632093854051 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8480392156862745, "acc_stderr": 0.025195658428931792, "acc_norm": 0.8480392156862745, "acc_norm_stderr": 0.025195658428931792 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.8059071729957806, "acc_stderr": 0.025744902532290916, "acc_norm": 0.8059071729957806, "acc_norm_stderr": 0.025744902532290916 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6860986547085202, "acc_stderr": 0.031146796482972465, "acc_norm": 0.6860986547085202, "acc_norm_stderr": 0.031146796482972465 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.8091603053435115, "acc_stderr": 0.03446513350752598, "acc_norm": 0.8091603053435115, "acc_norm_stderr": 0.03446513350752598 }, "harness|hendrycksTest-international_law|5": { "acc": 0.7768595041322314, "acc_stderr": 0.03800754475228732, "acc_norm": 0.7768595041322314, "acc_norm_stderr": 0.03800754475228732 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7777777777777778, "acc_stderr": 0.0401910747255735, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.0401910747255735 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7607361963190185, "acc_stderr": 0.0335195387952127, "acc_norm": 0.7607361963190185, "acc_norm_stderr": 0.0335195387952127 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.44642857142857145, "acc_stderr": 0.04718471485219588, "acc_norm": 0.44642857142857145, "acc_norm_stderr": 0.04718471485219588 }, "harness|hendrycksTest-management|5": { "acc": 0.7669902912621359, "acc_stderr": 0.04185832598928315, "acc_norm": 0.7669902912621359, "acc_norm_stderr": 0.04185832598928315 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8846153846153846, "acc_stderr": 0.02093019318517933, "acc_norm": 0.8846153846153846, "acc_norm_stderr": 0.02093019318517933 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.8250319284802043, "acc_stderr": 0.013586619219903341, "acc_norm": 0.8250319284802043, "acc_norm_stderr": 0.013586619219903341 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7341040462427746, "acc_stderr": 0.02378620325550829, "acc_norm": 0.7341040462427746, "acc_norm_stderr": 0.02378620325550829 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.4424581005586592, "acc_stderr": 0.016611393687268584, "acc_norm": 0.4424581005586592, "acc_norm_stderr": 0.016611393687268584 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7254901960784313, "acc_stderr": 0.025553169991826524, "acc_norm": 0.7254901960784313, "acc_norm_stderr": 0.025553169991826524 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.7041800643086816, "acc_stderr": 0.025922371788818767, "acc_norm": 0.7041800643086816, "acc_norm_stderr": 0.025922371788818767 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7376543209876543, "acc_stderr": 0.024477222856135114, "acc_norm": 0.7376543209876543, "acc_norm_stderr": 0.024477222856135114 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.5070921985815603, "acc_stderr": 0.02982449855912901, "acc_norm": 0.5070921985815603, "acc_norm_stderr": 0.02982449855912901 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.47327249022164275, "acc_stderr": 0.01275197796767601, "acc_norm": 0.47327249022164275, "acc_norm_stderr": 0.01275197796767601 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6875, "acc_stderr": 0.02815637344037142, "acc_norm": 0.6875, "acc_norm_stderr": 0.02815637344037142 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6715686274509803, "acc_stderr": 0.018999707383162673, "acc_norm": 0.6715686274509803, "acc_norm_stderr": 0.018999707383162673 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6727272727272727, "acc_stderr": 0.0449429086625209, "acc_norm": 0.6727272727272727, "acc_norm_stderr": 0.0449429086625209 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7346938775510204, "acc_stderr": 0.028263889943784593, "acc_norm": 0.7346938775510204, "acc_norm_stderr": 0.028263889943784593 }, "harness|hendrycksTest-sociology|5": { "acc": 0.835820895522388, "acc_stderr": 0.026193923544454125, "acc_norm": 0.835820895522388, "acc_norm_stderr": 0.026193923544454125 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.87, "acc_stderr": 0.03379976689896308, "acc_norm": 0.87, "acc_norm_stderr": 0.03379976689896308 }, "harness|hendrycksTest-virology|5": { "acc": 0.5662650602409639, "acc_stderr": 0.03858158940685516, "acc_norm": 0.5662650602409639, "acc_norm_stderr": 0.03858158940685516 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8538011695906432, "acc_stderr": 0.027097290118070806, "acc_norm": 0.8538011695906432, "acc_norm_stderr": 0.027097290118070806 }, "harness|truthfulqa:mc|0": { "mc1": 0.6291309669522643, "mc1_stderr": 0.016909693580248835, "mc2": 0.7748431207842124, "mc2_stderr": 0.013793813723651245 }, "harness|winogrande|5": { "acc": 0.8476716653512234, "acc_stderr": 0.010099208246065597 }, "harness|gsm8k|5": { "acc": 0.6921910538286581, "acc_stderr": 0.012714401009923647 } }



