open-llm-leaderboard-old/details_mayacinka__chatty-djinn-14B
收藏数据集概述
该数据集是在对模型mayacinka/chatty-djinn-14B进行评估运行时自动创建的,用于Open LLM Leaderboard。
数据集组成
- 数据集包含63个配置,每个配置对应一个评估任务。
- 数据集从1次运行中创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train"分割始终指向最新的结果。
- 额外的"results"配置存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_mayacinka__chatty-djinn-14B", "harness_winogrande_5", split="train")
最新结果
以下是2024-02-29T19:49:56.199730运行的最新结果:
python { "all": { "acc": 0.6471791497818997, "acc_stderr": 0.032222361716232134, "acc_norm": 0.6487462376694461, "acc_norm_stderr": 0.03287960578340177, "mc1": 0.5006119951040392, "mc1_stderr": 0.017503487938892507, "mc2": 0.6757057112603054, "mc2_stderr": 0.015098383133095484 }, "harness|arc:challenge|25": { "acc": 0.6732081911262798, "acc_stderr": 0.013706665975587338, "acc_norm": 0.7039249146757679, "acc_norm_stderr": 0.013340916085246249 }, "harness|hellaswag|10": { "acc": 0.6602270464050985, "acc_stderr": 0.004726640532562037, "acc_norm": 0.8644692292372037, "acc_norm_stderr": 0.003415900722381885 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.28, "acc_stderr": 0.04512608598542128, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6222222222222222, "acc_stderr": 0.04188307537595852, "acc_norm": 0.6222222222222222, "acc_norm_stderr": 0.04188307537595852 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7105263157894737, "acc_stderr": 0.03690677986137283, "acc_norm": 0.7105263157894737, "acc_norm_stderr": 0.03690677986137283 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.65, "acc_stderr": 0.0479372485441102, "acc_norm": 0.65, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7018867924528301, "acc_stderr": 0.02815283794249387, "acc_norm": 0.7018867924528301, "acc_norm_stderr": 0.02815283794249387 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7569444444444444, "acc_stderr": 0.035868792800803406, "acc_norm": 0.7569444444444444, "acc_norm_stderr": 0.035868792800803406 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.55, "acc_stderr": 0.05, "acc_norm": 0.55, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6473988439306358, "acc_stderr": 0.036430371689585475, "acc_norm": 0.6473988439306358, "acc_norm_stderr": 0.036430371689585475 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.38235294117647056, "acc_stderr": 0.04835503696107223, "acc_norm": 0.38235294117647056, "acc_norm_stderr": 0.04835503696107223 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.74, "acc_stderr": 0.04408440022768078, "acc_norm": 0.74, "acc_norm_stderr": 0.04408440022768078 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5872340425531914, "acc_stderr": 0.03218471141400351, "acc_norm": 0.5872340425531914, "acc_norm_stderr": 0.03218471141400351 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4649122807017544, "acc_stderr": 0.046920083813689104, "acc_norm": 0.4649122807017544, "acc_norm_stderr": 0.046920083813689104 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5586206896551724, "acc_stderr": 0.04137931034482757, "acc_norm": 0.5586206896551724, "acc_norm_stderr": 0.04137931034482757 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41798941798941797, "acc_stderr": 0.025402555503260912, "acc_norm": 0.41798941798941797, "acc_norm_stderr": 0.025402555503260912 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.49206349206349204, "acc_stderr": 0.044715725362943486, "acc_norm": 0.49206349206349204, "acc_norm_stderr": 0.044715725362943486 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7741935483870968, "acc_stderr": 0.023785577884181015, "acc_norm": 0.7741935483870968, "acc_norm_stderr": 0.023785577884181015 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.47783251231527096, "acc_stderr": 0.03514528562175008, "acc_norm": 0.47783251231527096, "acc_norm_stderr": 0.03514528562175008 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.67, "acc_stderr": 0.04725815626252609, "acc_norm": 0.67, "acc_norm_stderr": 0.04725815626252609 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7636363636363637, "acc_stderr": 0.03317505930009182, "acc_norm": 0.7636363636363637, "acc_norm_stderr": 0.03317505930009182 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7929292929292929, "acc_stderr": 0.028869778460267045, "acc_norm": 0.7929292929292929, "acc_norm_stderr": 0.028869778460267045 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9015544041450777, "acc_stderr": 0.02150024957603348, "acc_norm": 0.9015544041450777, "acc_norm_stderr": 0.02150024957603348 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6641025641025641, "acc_stderr": 0.023946724741563976, "acc_norm": 0.6641025641025641, "acc_norm_stderr": 0.023946724741563976 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3592592592592593, "acc_stderr": 0.029252905927251972, "acc_norm": 0.3592592592592593, "acc_norm_stderr": 0.029252905927251972 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6974789915966386, "acc_stderr": 0.029837962388291932, "acc_norm": 0.6974789915966386, "acc_norm_stderr": 0.029837962388291932 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.37748344370860926, "acc_stderr": 0.0395802723112157, "acc_norm": 0.37748344370860926, "acc_norm_stderr": 0.0395802723112157 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8477064220183487, "acc_stderr": 0.015405084393157074, "acc_norm": 0.8477064220183487, "acc_norm_stderr": 0.015405084393157074 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5324074074074074, "acc_stderr": 0.03402801581358966, "acc_norm": 0.5324074074074074, "acc_norm_stderr": 0.03402801581358966 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8382352941176471, "acc_stderr": 0.025845017986926917, "acc_norm": 0.8382352941176471, "acc_norm_stderr": 0.025845017986926917 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.810126582278481, "acc_stderr": 0.02553010046023349, "acc_norm": 0.810126582278481, "acc_norm_stderr": 0.02553010046023349 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6860986547085202, "acc_stderr": 0.031146796482972465, "acc_norm": 0.6860986547085202, "acc_norm_stderr": 0.031146796482972465 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7786259541984732, "acc_stderr": 0.03641297081313729, "acc_norm": 0.7786259541984732, "acc_norm_stderr": 0.03641297081313729 }, "harness|hendrycksTest-international_law|5": { "acc": 0.768595041322314, "acc_stderr": 0.03849856098794088, "acc_norm": 0.768595041322314, "acc_norm_stderr": 0.03849856098794088 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7777777777777778, "acc_stderr": 0.0401910747255735, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.0401910747255735 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7607361963190185, "acc_stderr": 0.0335195387952127, "acc_norm": 0.7607361963190185, "acc_norm_stderr": 0.0335195387952127 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.45535714285714285, "acc_stderr": 0.047268355537191, "acc_norm": 0.45535714285714285, "acc_norm_stderr": 0.047268355537191 }, "harness|hendrycksTest-management|5": { "acc": 0.7766990291262136, "acc_stderr": 0.04123553189891431, "acc_norm": 0.7766990291262136, "acc_norm_stderr": 0.04123553189891431 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8803418803418803, "acc_stderr": 0.021262719400406957, "acc_norm": 0.8803418803418803, "acc_norm_stderr": 0.021262719400406957 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.8212005108556832, "acc_stderr": 0.013702643715368982, "acc_norm": 0.8212005108556832, "acc_norm_stderr": 0.013702643715368982 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7109826589595376, "acc_stderr": 0.02440517393578323, "acc_norm": 0.7109826589595376, "acc_norm_stderr": 0.02440517393578323 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.4212290502793296, "acc_stderr": 0.016513676031179595, "acc_norm": 0.4212290502793296, "acc_norm_stderr": 0.016513676031179595 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7287581699346405, "acc_stderr": 0.025457756696667878, "acc_norm": 0.7287581699346405, "acc_norm_stderr": 0.025457756696667878 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.7138263665594855, "acc_stderr": 0.02567025924218893, "acc_norm": 0.7138263665594855, "acc_norm_stderr": 0.02567025924218893 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7345679012345679, "acc_stderr": 0.024569223600460842, "acc_norm": 0.7345679012345679, "acc_norm_stderr": 0.024569223600460842 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.4929078014184397, "acc_stderr": 0.02982449855912901, "acc_norm": 0.4929078014184397, "acc_norm_stderr": 0.02982449855912901 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.4641460234680574, "acc_stderr": 0.012737361318730581, "acc_norm": 0.4641460234680574, "acc_norm_stderr": 0.012737361318730581 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6801470588235294, "acc_stderr": 0.028332959514031208, "acc_norm": 0.6801470588235294, "acc_norm_stderr": 0.028332959514031208 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6797385620915033, "acc_stderr": 0.018875682938069443, "acc_norm": 0.6797385620915033, "acc_norm_stderr": 0.018875682938069443 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6545454545454545, "acc_stderr": 0.04554619617541054, "acc_norm": 0.6545454545454545, "acc_norm_stderr": 0.04554619617541054 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7224489795918367, "acc_stderr": 0.028666857790274648, "acc_norm": 0.7224489795918367, "acc_norm_stderr": 0.028666857790274648 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8407960199004975, "acc_stderr": 0.025870646766169146, "acc_norm": 0.8407960199004975, "acc_norm_stderr": 0.025870646766169146 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.86, "acc_stderr": 0.03487350880197771, "acc_norm": 0.86, "acc_norm_stderr": 0.03487350880197771 }, "harness|hendrycksTest-virology|5": { "acc": 0.536144578313253, "acc_stderr": 0.03882310850890594, "acc_norm": 0.536144578313253, "acc_norm_stderr": 0.03882310850890594 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8128654970760234, "acc_stderr": 0.02991312723236804, "acc_norm": 0.8128654970760234, "acc_norm_stderr": 0.02991312723236804 }, "harness|truthfulqa:mc|0": { "mc1": 0.5006119951040392, "mc1_stderr": 0.017503487938892507, "mc2": 0.6757057112603054, "mc2_stderr": 0.015098383133095484 }, "harness|winogrande|5": { "acc": 0.8310970797158642, "acc_stderr": 0.010529981411838911 }, "harness|gsm8k|5": { "acc": 0.6057619408642911, "acc_stderr": 0.013460852357095668 } }
配置详情
-
config_name: harness_arc_challenge_25
- 分割: 2024_02_29T19_49_56.199730
- 路径:
**/details_harness|arc:challenge|25_2024-02-29T19-49-56.199730.parquet
- 路径:
- 分割: latest
- 路径:
**/details_harness|arc:challenge|25_2024-02-29T19-49-56.199730.parquet
- 路径:
- 分割: 2024_02_29T19_49_56.199730
-
config_name: harness_gsm8k_5
- 分割: 2024_02_29T19_49_56.199730
- 路径:
**/details_harness|gsm8k|5_2024-02-29T19-49-56.199730.parquet
- 路径:
- 分割: latest
- 路径:
**/details_harness|gsm8k|5_2024-02-29T19-49-56.199730.parquet
- 路径:
- 分割: 2024_02_29T19_49_56.199730
-
config_name: harness_hellaswag_10
- 分割: 2024_02_29T19_49_56.199730
- 路径:
**/details_harness|hellaswag|10_2024-02-29T19-49-56.199730.parquet
- 路径:
- 分割: latest
- 路径:
**/details_harness|hellaswag|10_2024-02-29T19-49-56.199730.parquet
- 路径:
- 分割: 2024_02_29T19_49_56.199730
-
config_name: harness_hendrycksTest_5
- 分割: 2024_02_29T19_49_56.199730
- 路径:
**/details_harness|hendrycksTest-abstract_algebra|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-anatomy|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-astronomy|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-business_ethics|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-clinical_knowledge|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-college_biology|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-college_chemistry|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-college_computer_science|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-college_mathematics|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-college_medicine|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-college_physics|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-computer_security|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-conceptual_physics|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-econometrics|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-electrical_engineering|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-elementary_mathematics|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-formal_logic|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-global_facts|5_2024-02-29T19-49-56.199730.parquet**/details_harness|hendrycksTest-high_school_biology|5_2024-02-29T19-49-56.199730.parquet- ...
- 路径:
- 分割: 2024_02_29T19_49_56.199730



