open-llm-leaderboard-old/details_LeroyDyer__Mixtral_AI_Cyber_3.m2
收藏数据集概述
该数据集是在对模型 LeroyDyer/Mixtral_AI_Cyber_3.m2 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集从 1 次运行中创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_LeroyDyer__Mixtral_AI_Cyber_3.m2", "harness_winogrande_5", split="train")
最新结果
以下是 2024-03-30T18:17:30.935926 运行的最新结果:
python { "all": { "acc": 0.6410661049272328, "acc_stderr": 0.03236302400700874, "acc_norm": 0.6435005047677097, "acc_norm_stderr": 0.03301071023017795, "mc1": 0.4810281517747858, "mc1_stderr": 0.01749089640576235, "mc2": 0.6462359556185032, "mc2_stderr": 0.015151950873196571 }, "harness|arc:challenge|25": { "acc": 0.628839590443686, "acc_stderr": 0.014117971901142817, "acc_norm": 0.674061433447099, "acc_norm_stderr": 0.013697432466693246 }, "harness|hellaswag|10": { "acc": 0.6872137024497113, "acc_stderr": 0.004626805906522211, "acc_norm": 0.8687512447719578, "acc_norm_stderr": 0.0033698210047622503 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5703703703703704, "acc_stderr": 0.042763494943766, "acc_norm": 0.5703703703703704, "acc_norm_stderr": 0.042763494943766 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7236842105263158, "acc_stderr": 0.03639057569952928, "acc_norm": 0.7236842105263158, "acc_norm_stderr": 0.03639057569952928 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.63, "acc_stderr": 0.04852365870939099, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7245283018867924, "acc_stderr": 0.02749566368372406, "acc_norm": 0.7245283018867924, "acc_norm_stderr": 0.02749566368372406 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7777777777777778, "acc_stderr": 0.034765901043041336, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.034765901043041336 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.36, "acc_stderr": 0.048241815132442176, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6763005780346821, "acc_stderr": 0.035676037996391706, "acc_norm": 0.6763005780346821, "acc_norm_stderr": 0.035676037996391706 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.048971049527263666, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.048971049527263666 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.042295258468165065, "acc_norm": 0.77, "acc_norm_stderr": 0.042295258468165065 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.574468085106383, "acc_stderr": 0.03232146916224469, "acc_norm": 0.574468085106383, "acc_norm_stderr": 0.03232146916224469 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.47368421052631576, "acc_stderr": 0.04697085136647863, "acc_norm": 0.47368421052631576, "acc_norm_stderr": 0.04697085136647863 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5310344827586206, "acc_stderr": 0.04158632762097828, "acc_norm": 0.5310344827586206, "acc_norm_stderr": 0.04158632762097828 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.40476190476190477, "acc_stderr": 0.0252798503974049, "acc_norm": 0.40476190476190477, "acc_norm_stderr": 0.0252798503974049 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3888888888888889, "acc_stderr": 0.04360314860077459, "acc_norm": 0.3888888888888889, "acc_norm_stderr": 0.04360314860077459 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.41, "acc_stderr": 0.04943110704237102, "acc_norm": 0.41, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7580645161290323, "acc_stderr": 0.024362599693031093, "acc_norm": 0.7580645161290323, "acc_norm_stderr": 0.024362599693031093 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5073891625615764, "acc_stderr": 0.035176035403610105, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7575757575757576, "acc_stderr": 0.03346409881055953, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.03346409881055953 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7878787878787878, "acc_stderr": 0.029126522834586815, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.029126522834586815 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8497409326424871, "acc_stderr": 0.02578772318072387, "acc_norm": 0.8497409326424871, "acc_norm_stderr": 0.02578772318072387 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6461538461538462, "acc_stderr": 0.024243783994062157, "acc_norm": 0.6461538461538462, "acc_norm_stderr": 0.024243783994062157 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.32592592592592595, "acc_stderr": 0.02857834836547308, "acc_norm": 0.325925925



