open-llm-leaderboard/details_varox34__minillm-7B-init-13B-sft
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上对模型 varox34/minillm-7B-init-13B-sft 进行评估运行期间自动创建的。数据集由 63 个配置组成,每个配置对应一个被评估的任务。
数据集创建
数据集由 1 次运行创建而成。每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。"train" 分割始终指向最新的结果。
结果配置
一个额外的配置 "results" 存储所有聚合的运行结果(用于计算和显示 Open LLM Leaderboard 上的聚合指标)。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_varox34__minillm-7B-init-13B-sft",
                    "harness_winogrande_5",
                    split="train")
```
最新结果
以下是 2024-04-15T15:56:46.315091 运行的最新结果:
python { "all": { "acc": 0.35462956475919355, "acc_stderr": 0.03344663620513609, "acc_norm": 0.3571839751503999, "acc_norm_stderr": 0.034231531268842284, "mc1": 0.2141982864137087, "mc1_stderr": 0.01436214815569047, "mc2": 0.3356507077011393, "mc2_stderr": 0.014023095423451806 }, "harness|arc:challenge|25": { "acc": 0.49402730375426623, "acc_stderr": 0.014610348300255793, "acc_norm": 0.523037542662116, "acc_norm_stderr": 0.014595873205358262 }, "harness|hellaswag|10": { "acc": 0.5846444931288588, "acc_stderr": 0.004917761181740164, "acc_norm": 0.7779326827325234, "acc_norm_stderr": 0.004147867246653342 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.27, "acc_stderr": 0.044619604333847415, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847415 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.37777777777777777, "acc_stderr": 0.04188307537595853, "acc_norm": 0.37777777777777777, "acc_norm_stderr": 0.04188307537595853 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.29605263157894735, "acc_stderr": 0.03715062154998904, "acc_norm": 0.29605263157894735, "acc_norm_stderr": 0.03715062154998904 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.43, "acc_stderr": 0.04975698519562428, "acc_norm": 0.43, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.41132075471698115, "acc_stderr": 0.0302850092590098, "acc_norm": 0.41132075471698115, "acc_norm_stderr": 0.0302850092590098 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.3333333333333333, "acc_stderr": 0.039420826399272135, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.039420826399272135 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.23, "acc_stderr": 0.04229525846816506, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816506 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.2254335260115607, "acc_stderr": 0.03186209851641144, "acc_norm": 0.2254335260115607, "acc_norm_stderr": 0.03186209851641144 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.24509803921568626, "acc_stderr": 0.042801058373643966, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.042801058373643966 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956911 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.35319148936170214, "acc_stderr": 0.03124532520276193, "acc_norm": 0.35319148936170214, "acc_norm_stderr": 0.03124532520276193 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.23684210526315788, "acc_stderr": 0.039994238792813365, "acc_norm": 0.23684210526315788, "acc_norm_stderr": 0.039994238792813365 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.2827586206896552, "acc_stderr": 0.037528339580033376, "acc_norm": 0.2827586206896552, "acc_norm_stderr": 0.037528339580033376 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.26455026455026454, "acc_stderr": 0.02271746789770862, "acc_norm": 0.26455026455026454, "acc_norm_stderr": 0.02271746789770862 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.21428571428571427, "acc_stderr": 0.03670066451047181, "acc_norm": 0.21428571428571427, "acc_norm_stderr": 0.03670066451047181 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.34516129032258064, "acc_stderr": 0.027045746573534327, "acc_norm": 0.34516129032258064, "acc_norm_stderr": 0.027045746573534327 }, "harness|hendrycksTest-high_school_chemistry|5": { 
"acc": 0.26108374384236455, "acc_stderr": 0.030903796952114485, "acc_norm": 0.26108374384236455, "acc_norm_stderr": 0.030903796952114485 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.4666666666666667, "acc_stderr": 0.03895658065271847, "acc_norm": 0.4666666666666667, "acc_norm_stderr": 0.03895658065271847 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.3686868686868687, "acc_stderr": 0.03437305501980619, "acc_norm": 0.3686868686868687, "acc_norm_stderr": 0.03437305501980619 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.48186528497409326, "acc_stderr": 0.03606065001832919, "acc_norm": 0.48186528497409326, "acc_norm_stderr": 0.03606065001832919 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.3230769230769231, "acc_stderr": 0.02371088850197057, "acc_norm": 0.3230769230769231, "acc_norm_stderr": 0.02371088850197057 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.23703703703703705, "acc_stderr": 0.025928



