open-llm-leaderboard-old/details_appvoid__palmer-002
数据集概述
该数据集是在对模型 appvoid/palmer-002 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_appvoid__palmer-002", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-01-06T07:18:28.749206 运行的最新结果:
python { "all": { "acc": 0.2654825125280899, "acc_stderr": 0.031088201705180903, "acc_norm": 0.26647223357399397, "acc_norm_stderr": 0.03186346226006861, "mc1": 0.23011015911872704, "mc1_stderr": 0.014734557959807765, "mc2": 0.37064240232235823, "mc2_stderr": 0.014044445004895498 }, "harness|arc:challenge|25": { "acc": 0.3216723549488055, "acc_stderr": 0.013650488084494166, "acc_norm": 0.3447098976109215, "acc_norm_stderr": 0.01388881628678211 }, "harness|hellaswag|10": { "acc": 0.4509061939852619, "acc_stderr": 0.004965670398127355, "acc_norm": 0.5941047600079665, "acc_norm_stderr": 0.004900608529778596 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.26, "acc_stderr": 0.04408440022768081, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768081 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.2518518518518518, "acc_stderr": 0.03749850709174021, "acc_norm": 0.2518518518518518, "acc_norm_stderr": 0.03749850709174021 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.17763157894736842, "acc_stderr": 0.03110318238312338, "acc_norm": 0.17763157894736842, "acc_norm_stderr": 0.03110318238312338 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.26, "acc_stderr": 0.04408440022768079, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768079 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2679245283018868, "acc_stderr": 0.027257260322494845, "acc_norm": 0.2679245283018868, "acc_norm_stderr": 0.027257260322494845 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2152777777777778, "acc_stderr": 0.03437079344106133, "acc_norm": 0.2152777777777778, "acc_norm_stderr": 0.03437079344106133 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.24, "acc_stderr": 0.04292346959909283, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909283 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.2138728323699422, "acc_stderr": 0.03126511206173043, "acc_norm": 0.2138728323699422, "acc_norm_stderr": 0.03126511206173043 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.20588235294117646, "acc_stderr": 0.04023382273617749, "acc_norm": 0.20588235294117646, "acc_norm_stderr": 0.04023382273617749 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.3191489361702128, "acc_stderr": 0.030472973363380045, "acc_norm": 0.3191489361702128, "acc_norm_stderr": 0.030472973363380045 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.21052631578947367, "acc_stderr": 0.038351539543994194, "acc_norm": 0.21052631578947367, "acc_norm_stderr": 0.038351539543994194 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.2206896551724138, "acc_stderr": 0.03455930201924811, "acc_norm": 0.2206896551724138, "acc_norm_stderr": 0.03455930201924811 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.2619047619047619, "acc_stderr": 0.022644212615525218, "acc_norm": 0.2619047619047619, "acc_norm_stderr": 0.022644212615525218 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.24603174603174602, "acc_stderr": 0.038522733649243156, "acc_norm": 0.24603174603174602, "acc_norm_stderr": 0.038522733649243156 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.25483870967741934, "acc_stderr": 0.024790118459332208, "acc_norm": 0.25483870967741934, "acc_norm_stderr": 0.024790118459332208 }, "harness|hendrycksTest-high_school_chemistry|5": { 
"acc": 0.2561576354679803, "acc_stderr": 0.0307127300709826, "acc_norm": 0.2561576354679803, "acc_norm_stderr": 0.0307127300709826 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.2787878787878788, "acc_stderr": 0.03501438706296782, "acc_norm": 0.2787878787878788, "acc_norm_stderr": 0.03501438706296782 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.21717171717171718, "acc_stderr": 0.029376616484945637, "acc_norm": 0.21717171717171718, "acc_norm_stderr": 0.029376616484945637 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.21761658031088082, "acc_stderr": 0.029778663037752954, "acc_norm": 0.21761658031088082, "acc_norm_stderr": 0.029778663037752954 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.2512820512820513, "acc_stderr": 0.021992016662370547, "acc_norm": 0.2512820512820513, "acc_norm_stderr": 0.021992016662370547 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.26296296296296295, "acc_stderr": 0.026842057873833706, "acc_norm": 0.26296296296296295, "acc_norm_stderr": 0.026842057873833706 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.25210084033613445, "acc_stderr": 0.028205545033277726, "acc_norm": 0.25210084033613445, "acc_norm_stderr": 0.028205545033277726 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.2251655629139073, "acc_stderr": 0.03410435282008936, "acc_norm": 0.2251655629139073, "acc_norm_stderr": 0.03410435282008936 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.23853211009174313, "acc_stderr": 0.01827257581023187, "acc_norm": 0.23853211009174313, "acc_norm_stderr": 0.01827257581023187 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.4583333333333333, "acc_stderr": 0.03398110890294636, "acc_norm": 0.4583333333333333, 
"acc_norm_stderr": 0.03398110890294636 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.22549019607843138, "acc_stderr": 0.02933116229425173, "acc_norm": 0.22549019607843138, "acc_norm_stderr": 0.02933116229425173 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.25316455696202533, "acc_stderr": 0.028304657943035286, "acc_norm": 0.25316455696202533, "acc_norm_stderr": 0.028304657943035286 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.37668161434977576, "acc_stderr": 0.032521134899291884, "acc_norm": 0.37668161434977576, "acc_norm_stderr": 0.032521134899291884 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.22900763358778625, "acc_stderr": 0.036853466317118506, "acc_norm": 0.22900763358778625, "acc_norm_stderr": 0.036853466317118506 }, "harness|hendrycksTest-international_law|5": { "acc": 0.2892561983471074, "acc_stderr": 0.041391127276354626, "acc_norm": 0.2892561983471074, "acc_norm_stderr": 0.041391127276354626 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.2777777777777778, "acc_stderr": 0.043300437496507437, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.043300437496507437 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.3128834355828221, "acc_stderr": 0.03642914578292404, "acc_norm": 0.3128834355828221, "acc_norm_stderr": 0.03642914578292404 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.26785714285714285, "acc_stderr": 0.04203277291467763, "acc_norm": 0.26785714285714285, "acc_norm_stderr": 0.04203277291467763 }, "harness|hendrycksTest-management|5": { "acc": 0.2524271844660194, "acc_stderr": 0.04301250399690877, "acc_norm": 0.2524271844660194, "acc_norm_stderr": 0.04301250399690877 }, "harness|hendrycksTest-marketing|5": { "acc": 0.2564102564102564, "acc_stderr": 0.028605953702004253, "acc_norm": 0.2564102564102564, "acc_norm_stderr": 0.028605953702004253 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.26, "acc_stderr": 0.044084400227680794, "acc_norm": 0.26, 
"acc_norm_stderr": 0.044084400227680794 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.28735632183908044, "acc_stderr": 0.0161824107306827, "acc_norm": 0.28735632183908044, "acc_norm_stderr": 0.0161824107306827 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.23699421965317918, "acc_stderr": 0.022894082489925992, "acc_norm": 0.23699421965317918, "acc_norm_stderr": 0.022894082489925992 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.2424581005586592, "acc_stderr": 0.014333522059217889, "acc_norm": 0.2424581005586592, "acc_norm_stderr": 0.014333522059217889 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.24836601307189543, "acc_stderr": 0.02473998135511359, "acc_norm": 0.24836601307189543, "acc_norm_stderr": 0.02473998135511359 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.2508038585209003, "acc_stderr": 0.024619771956697165, "acc_norm": 0.2508038585209003, "acc_norm_stderr": 0.024619771956697165 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.2654320987654321, "acc_stderr": 0.024569223600460845, "acc_norm": 0.2654320987654321, "acc_norm_stderr": 0.024569223600460845 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.26595744680851063, "acc_stderr": 0.026358065698880592, "acc_norm": 0.26595744680851063, "acc_norm_stderr": 0.026358065698880592 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.23272490221642764, "acc_stderr": 0.010792595553888493, "acc_norm": 0.23272490221642764, "acc_norm_stderr": 0.010792595553888493 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.3235294117647059, "acc_stderr": 0.028418208619406794, "acc_norm": 0.3235294117647059, "acc_norm_stderr": 0.028418208619406794 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.25163398692810457, "acc_stderr": 0.017555818091322277, "acc_norm": 0.25163398692810457, "acc_norm_stderr": 0.017555818091322277 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.33636363636363636, "acc_stderr": 0.04525393596302505, 
"acc_norm": 0.33636363636363636, "acc_norm_stderr": 0.04525393596302505 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.1673469387755102, "acc_stderr": 0.023897144768914524, "acc_norm": 0.1673469387755102, "acc_norm_stderr": 0.023897144768914524 }, "harness|hendrycksTest-sociology|5": { "acc": 0.24378109452736318, "acc_stderr": 0.030360490154014652, "acc_norm": 0.24378109452736318, "acc_norm_stderr": 0.030360490154014652 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "harness|hendrycksTest-virology|5": { "acc": 0.3192771084337349, "acc_stderr": 0.0362933532994786, "acc_norm": 0.3192771084337349, "acc_norm_stderr": 0.0362933532994786 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.21637426900584794, "acc_stderr": 0.03158149539338734, "acc_norm": 0.21637426900584794, "acc_norm_stderr": 0.03158149539338734 }, "harness|truthfulqa:mc|0": { "mc1": 0.23011015911872704, "mc1_stderr": 0.014734557959807765, "mc2": 0.37064240232235823, "mc2_stderr": 0.014044445004895498 }, "harness|winogrande|5": { "acc": 0.6266771902131019, "acc_stderr": 0.013594002763035526 }, "harness|gsm8k|5": { "acc": 0.012130401819560273, "acc_stderr": 0.0030152942428909504 } }



