open-llm-leaderboard-old/details_andysalerno__openchat-nectar-0.1
数据集概述
该数据集是在对模型 andysalerno/openchat-nectar-0.1 进行评估运行时自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建;每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_andysalerno__openchat-nectar-0.1", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-01-11T09:55:59.577915 运行的最新结果:
python { "all": { "acc": 0.654396718006494, "acc_stderr": 0.03184182155109172, "acc_norm": 0.6549387923759522, "acc_norm_stderr": 0.03249770872652723, "mc1": 0.3769889840881273, "mc1_stderr": 0.01696551757893035, "mc2": 0.5421624590053248, "mc2_stderr": 0.015360430241150334 }, "harness|arc:challenge|25": { "acc": 0.6254266211604096, "acc_stderr": 0.014144193471893449, "acc_norm": 0.6621160409556314, "acc_norm_stderr": 0.01382204792228351 }, "harness|hellaswag|10": { "acc": 0.6329416450906195, "acc_stderr": 0.004810175357870936, "acc_norm": 0.8299143596893049, "acc_norm_stderr": 0.003749401775087307 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.04725815626252606, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252606 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6370370370370371, "acc_stderr": 0.04153948404742398, "acc_norm": 0.6370370370370371, "acc_norm_stderr": 0.04153948404742398 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6973684210526315, "acc_stderr": 0.03738520676119668, "acc_norm": 0.6973684210526315, "acc_norm_stderr": 0.03738520676119668 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.67, "acc_stderr": 0.047258156262526094, "acc_norm": 0.67, "acc_norm_stderr": 0.047258156262526094 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7056603773584905, "acc_stderr": 0.028049186315695255, "acc_norm": 0.7056603773584905, "acc_norm_stderr": 0.028049186315695255 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7569444444444444, "acc_stderr": 0.0358687928008034, "acc_norm": 0.7569444444444444, "acc_norm_stderr": 0.0358687928008034 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.48, "acc_stderr": 0.050211673156867795, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.59, "acc_stderr": 0.04943110704237102, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.38, "acc_stderr": 0.048783173121456344, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456344 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6936416184971098, "acc_stderr": 0.03514942551267438, "acc_norm": 0.6936416184971098, "acc_norm_stderr": 0.03514942551267438 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.37254901960784315, "acc_stderr": 0.04810840148082635, "acc_norm": 0.37254901960784315, "acc_norm_stderr": 0.04810840148082635 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.73, "acc_stderr": 0.0446196043338474, "acc_norm": 0.73, "acc_norm_stderr": 0.0446196043338474 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6, "acc_stderr": 0.03202563076101735, "acc_norm": 0.6, "acc_norm_stderr": 0.03202563076101735 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.49122807017543857, "acc_stderr": 0.047028804320496165, "acc_norm": 0.49122807017543857, "acc_norm_stderr": 0.047028804320496165 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5655172413793104, "acc_stderr": 0.04130740879555498, "acc_norm": 0.5655172413793104, "acc_norm_stderr": 0.04130740879555498 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4444444444444444, "acc_stderr": 0.025591857761382182, "acc_norm": 0.4444444444444444, "acc_norm_stderr": 0.025591857761382182 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.48412698412698413, "acc_stderr": 0.04469881854072606, "acc_norm": 0.48412698412698413, "acc_norm_stderr": 0.04469881854072606 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.28, "acc_stderr": 0.04512608598542127, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8096774193548387, "acc_stderr": 0.022331707611823078, "acc_norm": 0.8096774193548387, "acc_norm_stderr": 0.022331707611823078 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.49261083743842365, "acc_stderr": 0.03517603540361008, "acc_norm": 
0.49261083743842365, "acc_norm_stderr": 0.03517603540361008 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7878787878787878, "acc_stderr": 0.031922715695483016, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.031922715695483016 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7929292929292929, "acc_stderr": 0.028869778460267042, "acc_norm": 0.7929292929292929, "acc_norm_stderr": 0.028869778460267042 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8963730569948186, "acc_stderr": 0.02199531196364424, "acc_norm": 0.8963730569948186, "acc_norm_stderr": 0.02199531196364424 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6641025641025641, "acc_stderr": 0.023946724741563973, "acc_norm": 0.6641025641025641, "acc_norm_stderr": 0.023946724741563973 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.37037037037037035, "acc_stderr": 0.029443169323031537, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.029443169323031537 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6848739495798319, "acc_stderr": 0.030176808288974337, "acc_norm": 0.6848739495798319, "acc_norm_stderr": 0.030176808288974337 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.3576158940397351, "acc_stderr": 0.03913453431177258, "acc_norm": 0.3576158940397351, "acc_norm_stderr": 0.03913453431177258 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8513761467889909, "acc_stderr": 0.015251253773660834, "acc_norm": 0.8513761467889909, "acc_norm_stderr": 0.015251253773660834 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5185185185185185, "acc_stderr": 0.0340763209385405, "acc_norm": 0.5185185185185185, "acc_norm_stderr": 0.0340763209385405 }, 
"harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8333333333333334, "acc_stderr": 0.02615686752393104, "acc_norm": 0.8333333333333334, "acc_norm_stderr": 0.02615686752393104 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.8143459915611815, "acc_stderr": 0.025310495376944863, "acc_norm": 0.8143459915611815, "acc_norm_stderr": 0.025310495376944863 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.7040358744394619, "acc_stderr": 0.030636591348699813, "acc_norm": 0.7040358744394619, "acc_norm_stderr": 0.030636591348699813 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7557251908396947, "acc_stderr": 0.03768335959728745, "acc_norm": 0.7557251908396947, "acc_norm_stderr": 0.03768335959728745 }, "harness|hendrycksTest-international_law|5": { "acc": 0.8181818181818182, "acc_stderr": 0.03520893951097653, "acc_norm": 0.8181818181818182, "acc_norm_stderr": 0.03520893951097653 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7592592592592593, "acc_stderr": 0.04133119440243839, "acc_norm": 0.7592592592592593, "acc_norm_stderr": 0.04133119440243839 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7730061349693251, "acc_stderr": 0.03291099578615769, "acc_norm": 0.7730061349693251, "acc_norm_stderr": 0.03291099578615769 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.4642857142857143, "acc_stderr": 0.04733667890053756, "acc_norm": 0.4642857142857143, "acc_norm_stderr": 0.04733667890053756 }, "harness|hendrycksTest-management|5": { "acc": 0.8155339805825242, "acc_stderr": 0.03840423627288276, "acc_norm": 0.8155339805825242, "acc_norm_stderr": 0.03840423627288276 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8974358974358975, "acc_stderr": 0.01987565502786744, "acc_norm": 0.8974358974358975, "acc_norm_stderr": 0.01987565502786744 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.79, "acc_stderr": 0.04093601807403326, "acc_norm": 0.79, "acc_norm_stderr": 0.04093601807403326 }, 
"harness|hendrycksTest-miscellaneous|5": { "acc": 0.8326947637292464, "acc_stderr": 0.013347327202920332, "acc_norm": 0.8326947637292464, "acc_norm_stderr": 0.013347327202920332 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7543352601156069, "acc_stderr": 0.023176298203992, "acc_norm": 0.7543352601156069, "acc_norm_stderr": 0.023176298203992 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.2636871508379888, "acc_stderr": 0.014736926383761976, "acc_norm": 0.2636871508379888, "acc_norm_stderr": 0.014736926383761976 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.738562091503268, "acc_stderr": 0.025160998214292456, "acc_norm": 0.738562091503268, "acc_norm_stderr": 0.025160998214292456 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.7138263665594855, "acc_stderr": 0.025670259242188936, "acc_norm": 0.7138263665594855, "acc_norm_stderr": 0.025670259242188936 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7777777777777778, "acc_stderr": 0.023132376234543332, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.023132376234543332 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.46808510638297873, "acc_stderr": 0.029766675075873866, "acc_norm": 0.46808510638297873, "acc_norm_stderr": 0.029766675075873866 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.48565840938722293, "acc_stderr": 0.012764981829524269, "acc_norm": 0.48565840938722293, "acc_norm_stderr": 0.012764981829524269 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.7352941176470589, "acc_stderr": 0.02679956202488766, "acc_norm": 0.7352941176470589, "acc_norm_stderr": 0.02679956202488766 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6781045751633987, "acc_stderr": 0.01890101532209309, "acc_norm": 0.6781045751633987, "acc_norm_stderr": 0.01890101532209309 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6636363636363637, "acc_stderr": 0.04525393596302505, "acc_norm": 0.6636363636363637, "acc_norm_stderr": 
0.04525393596302505 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7551020408163265, "acc_stderr": 0.027529637440174937, "acc_norm": 0.7551020408163265, "acc_norm_stderr": 0.027529637440174937 }, "harness|hendrycksTest-sociology|5": { "acc": 0.835820895522388, "acc_stderr": 0.02619392354445412, "acc_norm": 0.835820895522388, "acc_norm_stderr": 0.02619392354445412 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.86, "acc_stderr": 0.03487350880197768, "acc_norm": 0.86, "acc_norm_stderr": 0.03487350880197768 }, "harness|hendrycksTest-virology|5": { "acc": 0.5180722891566265, "acc_stderr": 0.03889951252827216, "acc_norm": 0.5180722891566265, "acc_norm_stderr": 0.03889951252827216 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8187134502923976, "acc_stderr": 0.029547741687640038, "acc_norm": 0.8187134502923976, "acc_norm_stderr": 0.029547741687640038 }, "harness|truthfulqa:mc|0": { "mc1": 0.3769889840881273, "mc1_stderr": 0.01696551757893035, "mc2": 0.5421624590053248, "mc2_stderr": 0.015360430241150334 }, "harness|winogrande|5": { "acc": 0.813733228097869, "acc_stderr": 0.01094187795567621 }, "harness|gsm8k|5": { "acc": 0.6967399545109931, "acc_stderr": 0.012661502663418697 } }



