open-llm-leaderboard-old/details_tourist800__mistral_2X7b
收藏数据集概述
数据集描述
该数据集是在评估模型tourist800/mistral_2X7b在Open LLM Leaderboard上的自动创建的。
数据集组成
- 数据集包含63个配置,每个配置对应一个评估任务。
- 数据集由1次运行创建,每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train"分割总是指向最新的结果。
- 一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_tourist800__mistral_2X7b", "harness_winogrande_5", split="train")
最新结果
以下是2024-01-28T18:43:40.962065的最新结果: python { "all": { "acc": 0.6111388777526852, "acc_stderr": 0.03287383799644916, "acc_norm": 0.6159662135212005, "acc_norm_stderr": 0.03353760847602086, "mc1": 0.36107711138310894, "mc1_stderr": 0.016814312844836882, "mc2": 0.5207873423930568, "mc2_stderr": 0.0153889376471881 }, "harness|arc:challenge|25": { "acc": 0.5861774744027304, "acc_stderr": 0.014392730009221004, "acc_norm": 0.6339590443686007, "acc_norm_stderr": 0.014077223108470137 }, "harness|hellaswag|10": { "acc": 0.6421031666998606, "acc_stderr": 0.004784018497679814, "acc_norm": 0.8376817367058355, "acc_norm_stderr": 0.0036798891253998155 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.26, "acc_stderr": 0.0440844002276808, "acc_norm": 0.26, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5851851851851851, "acc_stderr": 0.04256193767901408, "acc_norm": 0.5851851851851851, "acc_norm_stderr": 0.04256193767901408 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6973684210526315, "acc_stderr": 0.037385206761196686, "acc_norm": 0.6973684210526315, "acc_norm_stderr": 0.037385206761196686 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.57, "acc_stderr": 0.04975698519562428, "acc_norm": 0.57, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6679245283018868, "acc_stderr": 0.02898545565233439, "acc_norm": 0.6679245283018868, "acc_norm_stderr": 0.02898545565233439 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7013888888888888, "acc_stderr": 0.03827052357950756, "acc_norm": 0.7013888888888888, "acc_norm_stderr": 0.03827052357950756 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.43, "acc_stderr": 0.04975698519562428, "acc_norm": 0.43, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6184971098265896, "acc_stderr": 0.03703851193099521, "acc_norm": 0.6184971098265896, "acc_norm_stderr": 0.03703851193099521 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3627450980392157, "acc_stderr": 0.04784060704105653, "acc_norm": 0.3627450980392157, "acc_norm_stderr": 0.04784060704105653 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.74, "acc_stderr": 0.0440844002276808, "acc_norm": 0.74, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5063829787234042, "acc_stderr": 0.03268335899936337, "acc_norm": 0.5063829787234042, "acc_norm_stderr": 0.03268335899936337 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.40350877192982454, "acc_stderr": 0.046151869625837026, "acc_norm": 0.40350877192982454, "acc_norm_stderr": 0.046151869625837026 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6068965517241379, "acc_stderr": 0.040703290137070705, "acc_norm": 0.6068965517241379, "acc_norm_stderr": 0.040703290137070705 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4126984126984127, "acc_stderr": 0.025355741263055266, "acc_norm": 0.4126984126984127, "acc_norm_stderr": 0.025355741263055266 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3968253968253968, "acc_stderr": 0.0437588849272706, "acc_norm": 0.3968253968253968, "acc_norm_stderr": 0.0437588849272706 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.5741935483870968, "acc_stderr": 0.028129112709165897, "acc_norm": 0.5741935483870968, "acc_norm_stderr": 0.028129112709165897 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5073891625615764, "acc_stderr": 0.035176035403610105, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.63, "acc_stderr": 0.04852365870939099, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7575757575757576, "acc_stderr": 0.03346409881055953, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.03346409881055953 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7878787878787878, "acc_stderr": 0.029126522834586815, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.029126522834586815 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8549222797927462, "acc_stderr": 0.025416343096306433, "acc_norm": 0.8549222797927462, "acc_norm_stderr": 0.025416343096306433 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5743589743589743, "acc_stderr": 0.025069094387296532, "acc_norm": 0.5743589743589743, "acc_norm_stderr": 0.025069094387296532 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3074074074074074, "acc_stderr": 0.028133252578815632, "acc_norm": 0.3074074074074074, "acc_norm_stderr": 0.028133252578



