open-llm-leaderboard/details_TheBloke__robin-13B-v2-fp16
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 TheBloke/robin-13B-v2-fp16 的过程中自动创建的。数据集包含 61 个配置,每个配置对应一个评估任务。
数据集结构
数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割(split),分割名称即该次运行的时间戳。"train" 分割始终指向最新的结果。
额外配置
一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset
data = load_dataset("open-llm-leaderboard/details_TheBloke__robin-13B-v2-fp16", "harness_truthfulqa_mc_0", split="train")
```
最新结果
这些是最新结果(来自 2023-07-31T15:48:06.598529 运行)的示例: python { "all": { "acc": 0.49056004249413854, "acc_stderr": 0.034895228964178376, "acc_norm": 0.49452555601900244, "acc_norm_stderr": 0.03487806793899599, "mc1": 0.34149326805385555, "mc1_stderr": 0.016600688619950826, "mc2": 0.5063100731922137, "mc2_stderr": 0.014760623429029368 }, "harness|arc:challenge|25": { "acc": 0.5401023890784983, "acc_stderr": 0.01456431885692485, "acc_norm": 0.5648464163822525, "acc_norm_stderr": 0.014487986197186045 }, "harness|hellaswag|10": { "acc": 0.5945030870344553, "acc_stderr": 0.004899845087183104, "acc_norm": 0.8037243576976698, "acc_norm_stderr": 0.003963677261161229 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.04725815626252606, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252606 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4666666666666667, "acc_stderr": 0.043097329010363554, "acc_norm": 0.4666666666666667, "acc_norm_stderr": 0.043097329010363554 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.4868421052631579, "acc_stderr": 0.04067533136309173, "acc_norm": 0.4868421052631579, "acc_norm_stderr": 0.04067533136309173 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.45, "acc_stderr": 0.05, "acc_norm": 0.45, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.4679245283018868, "acc_stderr": 0.03070948699255655, "acc_norm": 0.4679245283018868, "acc_norm_stderr": 0.03070948699255655 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.4722222222222222, "acc_stderr": 0.04174752578923185, "acc_norm": 0.4722222222222222, "acc_norm_stderr": 0.04174752578923185 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.24, "acc_stderr": 0.04292346959909284, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909284 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.38, "acc_stderr": 0.04878317312145632, "acc_norm": 0.38, "acc_norm_stderr": 0.04878317312145632 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117317, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117317 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.44508670520231214, "acc_stderr": 0.03789401760283646, "acc_norm": 0.44508670520231214, "acc_norm_stderr": 0.03789401760283646 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.17647058823529413, "acc_stderr": 0.0379328118530781, "acc_norm": 0.17647058823529413, "acc_norm_stderr": 0.0379328118530781 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.62, "acc_stderr": 0.048783173121456316, "acc_norm": 0.62, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.4, "acc_stderr": 0.03202563076101735, "acc_norm": 0.4, "acc_norm_stderr": 0.03202563076101735 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.30701754385964913, "acc_stderr": 0.04339138322579861, "acc_norm": 0.30701754385964913, "acc_norm_stderr": 0.04339138322579861 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4068965517241379, "acc_stderr": 0.04093793981266237, "acc_norm": 0.4068965517241379, "acc_norm_stderr": 0.04093793981266237 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.25925925925925924, "acc_stderr": 0.02256989707491841, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.02256989707491841 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.31746031746031744, "acc_stderr": 0.04163453031302859, "acc_norm": 0.31746031746031744, "acc_norm_stderr": 0.04163453031302859 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.38, "acc_stderr": 0.04878317312145632, "acc_norm": 0.38, "acc_norm_stderr": 0.04878317312145632 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.49032258064516127, "acc_stderr": 0.028438677998909558, "acc_norm": 0.49032258064516127, "acc_norm_stderr": 0.028438677998909558 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.32019704433497537, 
"acc_stderr": 0.032826493853041504, "acc_norm": 0.32019704433497537, "acc_norm_stderr": 0.032826493853041504 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.47, "acc_stderr": 0.05016135580465919, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6303030303030303, "acc_stderr": 0.037694303145125674, "acc_norm": 0.6303030303030303, "acc_norm_stderr": 0.037694303145125674 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.5606060606060606, "acc_stderr": 0.03536085947529479, "acc_norm": 0.5606060606060606, "acc_norm_stderr": 0.03536085947529479 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.6683937823834197, "acc_stderr": 0.03397636541089118, "acc_norm": 0.6683937823834197, "acc_norm_stderr": 0.03397636541089118 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.44871794871794873, "acc_stderr": 0.025217315184846482, "acc_norm": 0.44871794871794873, "acc_norm_stderr": 0.025217315184846482 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.23333333333333334, "acc_stderr": 0.02578787422095932, "acc_norm": 0.23333333333333334, "acc_norm_stderr": 0.02578787422095932 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.4411764705882353, "acc_stderr": 0.0322529423239964, "acc_norm": 0



