open-llm-leaderboard-old/details_Walmart-the-bag__openchat-3.5-Infinity
收藏数据集概述
该数据集是在对模型Walmart-the-bag/openchat-3.5-Infinity进行评估运行期间自动创建的,用于Open LLM Leaderboard。
数据集组成
- 数据集包含63个配置,每个配置对应一个评估任务。
- 数据集由1次运行创建,每次运行可以在每个配置中找到特定的拆分,拆分名称使用运行的时间戳。
- "train"拆分始终指向最新的结果。
- 一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
加载数据示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_Walmart-the-bag__openchat-3.5-Infinity", "harness_winogrande_5", split="train")
最新结果
以下是2023-12-29T22:24:05.513640的最新结果:
python { "all": { "acc": 0.6482331644164944, "acc_stderr": 0.032166482561296374, "acc_norm": 0.6494647991303745, "acc_norm_stderr": 0.03282046947130963, "mc1": 0.3525091799265606, "mc1_stderr": 0.016724646380756547, "mc2": 0.5198754304913998, "mc2_stderr": 0.015470093705054921 }, "harness|arc:challenge|25": { "acc": 0.5836177474402731, "acc_stderr": 0.014405618279436176, "acc_norm": 0.6262798634812287, "acc_norm_stderr": 0.01413770860175909 }, "harness|hellaswag|10": { "acc": 0.6629157538338977, "acc_stderr": 0.00471747833568963, "acc_norm": 0.8404700258912567, "acc_norm_stderr": 0.0036542123295166145 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5851851851851851, "acc_stderr": 0.04256193767901408, "acc_norm": 0.5851851851851851, "acc_norm_stderr": 0.04256193767901408 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6907894736842105, "acc_stderr": 0.037610708698674805, "acc_norm": 0.6907894736842105, "acc_norm_stderr": 0.037610708698674805 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6867924528301886, "acc_stderr": 0.028544793319055326, "acc_norm": 0.6867924528301886, "acc_norm_stderr": 0.028544793319055326 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7777777777777778, "acc_stderr": 0.03476590104304134, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.03476590104304134 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.54, "acc_stderr": 0.05009082659620332, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6647398843930635, "acc_stderr": 0.03599586301247077, "acc_norm": 0.6647398843930635, "acc_norm_stderr": 0.03599586301247077 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.04897104952726366, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.04897104952726366 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5872340425531914, "acc_stderr": 0.03218471141400351, "acc_norm": 0.5872340425531914, "acc_norm_stderr": 0.03218471141400351 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.49122807017543857, "acc_stderr": 0.047028804320496165, "acc_norm": 0.49122807017543857, "acc_norm_stderr": 0.047028804320496165 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5379310344827586, "acc_stderr": 0.04154659671707548, "acc_norm": 0.5379310344827586, "acc_norm_stderr": 0.04154659671707548 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4074074074074074, "acc_stderr": 0.025305906241590632, "acc_norm": 0.4074074074074074, "acc_norm_stderr": 0.025305906241590632 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.49206349206349204, "acc_stderr": 0.044715725362943486, "acc_norm": 0.49206349206349204, "acc_norm_stderr": 0.044715725362943486 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7806451612903226, "acc_stderr": 0.023540799358723306, "acc_norm": 0.7806451612903226, "acc_norm_stderr": 0.023540799358723306 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5073891625615764, "acc_stderr": 0.035176035403610105, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7757575757575758, "acc_stderr": 0.032568666616811015, "acc_norm": 0.7757575757575758, "acc_norm_stderr": 0.032568666616811015 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7878787878787878, "acc_stderr": 0.029126522834586815, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.029126522834586815 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8860103626943006, "acc_stderr": 0.02293514405391945, "acc_norm": 0.8860103626943006, "acc_norm_stderr": 0.02293514405391945 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6641025641025641, "acc_stderr": 0.02394672474156397, "acc_norm": 0.6641025641025641, "acc_norm_stderr": 0.02394672474156397 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3333333333333333, "acc_stderr": 0.02874204090394848, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.02874204090394848 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6890756302521008, "acc_stderr": 0.030066761582977934, "acc_norm": 0.6890756302521008, "acc_norm_stderr": 0.030066761582977934 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.3509933774834437, "acc_stderr": 0.03896981964257375, "acc_norm": 0.3509933774834437, "acc_norm_stderr": 0.03896981964257375 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8495412844036697, "acc_stderr": 0.015328563932669237, "acc_norm": 0.8495412844036697, "acc_norm_stderr": 0.015328563932669237 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5601851851851852, "acc_stderr": 0.0338517797604481, "acc_norm": 0.5601851851851852, "acc_norm_stderr": 0.0338517797604481 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8333333333333334, "acc_stderr": 0.026156867523931045, "acc_norm": 0.8333333333333334, "acc_norm_stderr": 0.026156867523931045 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.7974683544303798, "acc_stderr": 0.026160568246601446, "acc_norm": 0.7974683544303798, "acc_norm_stderr": 0.026160568246601446 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6995515695067265, "acc_stderr": 0.030769352008229143, "acc_norm": 0.6995515695067265, "acc_norm_stderr": 0.030769352008229143 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7786259541984732, "acc_stderr": 0.03641297081313729, "acc_norm": 0.7786259541984732, "acc_norm_stderr": 0.03641297081313729 }, "harness|hendrycksTest-international_law|5": { "acc": 0.8016528925619835, "acc_stderr": 0.03640118271990947, "acc_norm": 0.8016528925619835, "acc_norm_stderr": 0.03640118271990947 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.8055555555555556, "acc_stderr": 0.038260763248848646, "acc_norm": 0.8055555555555556, "acc_norm_stderr": 0.038260763248848646 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7361963190184049, "acc_stderr": 0.034624199316156234, "acc_norm": 0.7361963190184049, "acc_norm_stderr": 0.034624199316156234 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.44642857142857145, "acc_stderr": 0.04718471485219588, "acc_norm": 0.44642857142857145, "acc_norm_stderr": 0.04718471485219588 }, "harness|hendrycksTest-management|5": { "acc": 0.8349514563106796, "acc_stderr": 0.036756688322331886, "acc_norm": 0.8349514563106796, "acc_norm_stderr": 0.036756688322331886 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8717948717948718, "acc_stderr": 0.02190190511507333, "acc_norm": 0.8717948717948718, "acc_norm_stderr": 0.02190190511507333 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.80970625798212, "acc_stderr": 0.014036945850381394, "acc_norm": 0.80970625798212, "acc_norm_stderr": 0.014036945850381394 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7283236994219653, "acc_stderr": 0.023948512905468348, "acc_norm": 0.7283236994219653, "acc_norm_stderr": 0.023948512905468348 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.37318435754189944, "acc_stderr": 0.016175692013381957, "acc_norm": 0.37318435754189944, "acc_norm_stderr": 0.016175692013381957 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7320261437908496, "acc_stderr": 0.025360603796242557, "acc_norm": 0.7320261437908496, "acc_norm_stderr": 0.025360603796242557 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.7202572347266881, "acc_stderr": 0.02549425935069491, "acc_norm": 0.7202572347266881, "acc_norm_stderr": 0.02549425935069491 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7037037037037037, "acc_stderr": 0.02540719779889016, "acc_norm": 0.7037037037037037, "acc_norm_stderr": 0.02540719779889016 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.48936170212765956, "acc_stderr": 0.029820747191422466, "acc_norm": 0.48936170212765956, "acc_norm_stderr": 0.029820747191422466 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.47196870925684486, "acc_stderr": 0.01275015180292244, "acc_norm": 0.47196870925684486, "acc_norm_stderr": 0.01275015180292244 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.7242647058823529, "acc_stderr": 0.027146271936625166, "acc_norm": 0.7242647058823529, "acc_norm_stderr": 0.027146271936625166 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6519607843137255, "acc_stderr": 0.019270998708223977, "acc_norm": 0.6519607843137255, "acc_norm_stderr": 0.019270998708223977 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6818181818181818, "acc_stderr": 0.04461272175910509, "acc_norm": 0.6818181818181818, "acc_norm_stderr": 0.04461272175910509 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7061224489795919, "acc_stderr": 0.02916273841024977, "acc_norm": 0.7061224489795919, "acc_norm_stderr": 0.02916273841024977 }, "harness|hendrycksTest-sociology|5": { "acc": 0.835820895522388, "acc_stderr": 0.02619392354445412, "acc_norm": 0.835820895522388, "acc_norm_stderr": 0.02619392354445412 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.86, "acc_stderr": 0.03487350880197771, "acc_norm": 0.86, "acc_norm_stderr": 0.03487350880197771 }, "harness|hendrycksTest-virology|5": { "acc": 0.5301204819277109, "acc_stderr": 0.03885425420866767, "acc_norm": 0.5301204819277109, "acc_norm_stderr": 0.03885425420866767 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8596491228070176, "acc_stderr": 0.026640582539133196, "acc_norm": 0.8596491228070176, "acc_norm_stderr": 0.026640582539133196 }, "harness|truthfulqa:mc|0": { "mc1": 0.3525091799265606, "mc1_stderr": 0.016724646380756547, "mc2": 0.5198754304913998, "mc2_stderr": 0.015470093705054921 }, "harness|winogrande|5": { "acc": 0.8011049723756906, "acc_stderr": 0.011218629972515328 }, "harness|gsm8k|5": { "acc": 0.6429112964366944, "acc_stderr": 0.013197931775445206 } }



