open-llm-leaderboard/details_AA051610__VA
收藏数据集概述
该数据集是在模型 AA051610/VA 在 Open LLM Leaderboard 上的评估运行期间自动创建的。数据集包含 61 个配置,每个配置对应一个评估任务。
数据集结构
- 配置数量:61
- 数据来源:1 次运行
- 数据分割:每个配置包含特定分割,分割名称使用运行的时间戳。"train" 分割始终指向最新结果。
- 额外配置:"results" 存储所有运行结果的聚合,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_AA051610__VA", "harness_truthfulqa_mc_0", split="train")
最新结果
以下是 2023-10-11T07:22:26.417131 运行的最新结果:
python { "all": { "acc": 0.4972405415581996, "acc_stderr": 0.03512578000813228, "acc_norm": 0.5002960487991649, "acc_norm_stderr": 0.03512615731416433, "mc1": 0.28518971848225216, "mc1_stderr": 0.015805827874454892, "mc2": 0.44928868954080875, "mc2_stderr": 0.014916546411376396 }, "harness|arc:challenge|25": { "acc": 0.3848122866894198, "acc_stderr": 0.014218371065251105, "acc_norm": 0.4138225255972696, "acc_norm_stderr": 0.014392730009221007 }, "harness|hellaswag|10": { "acc": 0.47390957976498704, "acc_stderr": 0.004982983592459198, "acc_norm": 0.6251742680740888, "acc_norm_stderr": 0.004830885704380092 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4444444444444444, "acc_stderr": 0.04292596718256981, "acc_norm": 0.4444444444444444, "acc_norm_stderr": 0.04292596718256981 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.45394736842105265, "acc_stderr": 0.04051646342874142, "acc_norm": 0.45394736842105265, "acc_norm_stderr": 0.04051646342874142 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.47, "acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.5547169811320755, "acc_stderr": 0.030588052974270658, "acc_norm": 0.5547169811320755, "acc_norm_stderr": 0.030588052974270658 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5069444444444444, "acc_stderr": 0.04180806750294938, "acc_norm": 0.5069444444444444, "acc_norm_stderr": 0.04180806750294938 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.24, "acc_stderr": 0.04292346959909284, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909284 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.48, "acc_stderr": 0.050211673156867795, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.32, "acc_stderr": 0.04688261722621504, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5144508670520231, "acc_stderr": 0.03810871630454764, "acc_norm": 0.5144508670520231, "acc_norm_stderr": 0.03810871630454764 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.18627450980392157, "acc_stderr": 0.03873958714149351, "acc_norm": 0.18627450980392157, "acc_norm_stderr": 0.03873958714149351 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.63, "acc_stderr": 0.048523658709390974, "acc_norm": 0.63, "acc_norm_stderr": 0.048523658709390974 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.4553191489361702, "acc_stderr": 0.032555253593403555, "acc_norm": 0.4553191489361702, "acc_norm_stderr": 0.032555253593403555 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.34210526315789475, "acc_stderr": 0.04462917535336936, "acc_norm": 0.34210526315789475, "acc_norm_stderr": 0.04462917535336936 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5103448275862069, "acc_stderr": 0.04165774775728762, "acc_norm": 0.5103448275862069, "acc_norm_stderr": 0.04165774775728762 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.328042328042328, "acc_stderr": 0.024180497164376896, "acc_norm": 0.328042328042328, "acc_norm_stderr": 0.024180497164376896 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.2698412698412698, "acc_stderr": 0.03970158273235172, "acc_norm": 0.2698412698412698, "acc_norm_stderr": 0.03970158273235172 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.567741935483871, "acc_stderr": 0.028181739720019416, "acc_norm": 0.567741935483871, "acc_norm_stderr": 0.028181739720019416 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.35960591133004927, "acc_stderr": 0.033764582465095665, "acc_norm": 0.35960591133004927, "acc_norm_stderr": 0.033764582465095665 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6121212121212121, "acc_stderr": 0.038049136539710114, "acc_norm": 0.6121212121212121, "acc_norm_stderr": 0.038049136539710114 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.5858585858585859, "acc_stderr": 0.03509438348879629, "acc_norm": 0.5858585858585859, "acc_norm_stderr": 0.03509438348879629 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.6269430051813472, "acc_stderr": 0.03490205592048573, "acc_norm": 0.6269430051813472, "acc_norm_stderr": 0.03490205592048573 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.44871794871794873, "acc_stderr": 0.025217315184846475, "acc_norm": 0.44871794871794873, "acc_norm_stderr": 0.025217315184846475 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.26296296296296295, "acc_stderr": 0.02684205787383371, "acc_norm": 0.26296296296296295, "acc_norm_stderr":



