open-llm-leaderboard-old/details_Weyaxi__Einstein-v6.1-Llama3-8B
收藏数据集概述
数据集简介
该数据集是在评估模型Weyaxi/Einstein-v6.1-Llama3-8B在Open LLM Leaderboard上的运行过程中自动创建的。
数据集结构
- 数据集包含63个配置,每个配置对应一个评估任务。
- 数据集由1次运行创建,每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train"分割始终指向最新的结果。
- 额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_Weyaxi__Einstein-v6.1-Llama3-8B", "harness_winogrande_5", split="train")
最新结果
以下是2024-04-23T20:07:16.089913运行的最新结果:
python { "all": { "acc": 0.6622568916711685, "acc_stderr": 0.031895873805474845, "acc_norm": 0.6639870618123946, "acc_norm_stderr": 0.03254210825825137, "mc1": 0.39412484700122397, "mc1_stderr": 0.017106588140700325, "mc2": 0.5510255717182386, "mc2_stderr": 0.015093889908062723 }, "harness|arc:challenge|25": { "acc": 0.5921501706484642, "acc_stderr": 0.014361097288449701, "acc_norm": 0.6245733788395904, "acc_norm_stderr": 0.014150631435111728 }, "harness|hellaswag|10": { "acc": 0.6246763592909779, "acc_stderr": 0.004832167854501646, "acc_norm": 0.8241386178052181, "acc_norm_stderr": 0.0037992414085029633 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6518518518518519, "acc_stderr": 0.041153246103369526, "acc_norm": 0.6518518518518519, "acc_norm_stderr": 0.041153246103369526 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7105263157894737, "acc_stderr": 0.036906779861372814, "acc_norm": 0.7105263157894737, "acc_norm_stderr": 0.036906779861372814 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7320754716981132, "acc_stderr": 0.027257260322494845, "acc_norm": 0.7320754716981132, "acc_norm_stderr": 0.027257260322494845 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7916666666666666, "acc_stderr": 0.033961162058453336, "acc_norm": 0.7916666666666666, "acc_norm_stderr": 0.033961162058453336 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.47, "acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6242774566473989, "acc_stderr": 0.036928207672648664, "acc_norm": 0.6242774566473989, "acc_norm_stderr": 0.036928207672648664 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.46078431372549017, "acc_stderr": 0.049598599663841815, "acc_norm": 0.46078431372549017, "acc_norm_stderr": 0.049598599663841815 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.82, "acc_stderr": 0.03861229196653694, "acc_norm": 0.82, "acc_norm_stderr": 0.03861229196653694 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5617021276595745, "acc_stderr": 0.03243618636108102, "acc_norm": 0.5617021276595745, "acc_norm_stderr": 0.03243618636108102 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5789473684210527, "acc_stderr": 0.046446020912223177, "acc_norm": 0.5789473684210527, "acc_norm_stderr": 0.046446020912223177 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6413793103448275, "acc_stderr": 0.039966295748767186, "acc_norm": 0.6413793103448275, "acc_norm_stderr": 0.039966295748767186 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41798941798941797, "acc_stderr": 0.02540255550326091, "acc_norm": 0.41798941798941797, "acc_norm_stderr": 0.02540255550326091 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5158730158730159, "acc_stderr": 0.044698818540726076, "acc_norm": 0.5158730158730159, "acc_norm_stderr": 0.044698818540726076 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.048241815132442176, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7838709677419354, "acc_stderr": 0.02341529343356853, "acc_norm": 0.7838709677419354, "acc_norm_stderr": 0.02341529343356853 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5467980295566502, "acc_stderr": 0.03502544650845872, "acc_norm": 0.5467980295566502, "acc_norm_stderr": 0.03502544650845872 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.63, "acc_stderr": 0.04852365870939099, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7575757575757576, "acc_stderr": 0.03346409881055953, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.03346409881055953 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.803030303030303, "acc_stderr": 0.028335609732463362, "acc_norm": 0.803030303030303, "acc_norm_stderr": 0.028335609732463362 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8860103626943006, "acc_stderr": 0.022935144053919436, "acc_norm": 0.8860103626943006, "acc_norm_stderr": 0.022935144053919436 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6512820512820513, "acc_stderr": 0.02416278028401772, "acc_norm": 0.6512820512820513, "acc_norm_stderr": 0.02416278028401772 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.37037037037037035, "acc_stderr": 0.02944316932303154, "acc_norm": 0.370370370370



