open-llm-leaderboard-old/details_rmihaylov__Llama-3-DARE-v1-8B
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 rmihaylov/Llama-3-DARE-v1-8B 的过程中自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。
数据集结构
数据集由 1 次运行创建。每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。"train" 分割始终指向最新的结果。
额外配置
有一个额外的配置 "results",存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_rmihaylov__Llama-3-DARE-v1-8B", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-04-21T09:09:31.418184 运行的最新结果:
python { "all": { "acc": 0.6736172904789415, "acc_stderr": 0.03153313093055085, "acc_norm": 0.6764067930840016, "acc_norm_stderr": 0.03215797050538993, "mc1": 0.34394124847001223, "mc1_stderr": 0.01662908751427678, "mc2": 0.5132089831114146, "mc2_stderr": 0.01512529387479233 }, "harness|arc:challenge|25": { "acc": 0.5793515358361775, "acc_stderr": 0.014426211252508397, "acc_norm": 0.6220136518771331, "acc_norm_stderr": 0.0141696645203031 }, "harness|hellaswag|10": { "acc": 0.6048595897231627, "acc_stderr": 0.004878816961012044, "acc_norm": 0.7976498705437164, "acc_norm_stderr": 0.0040093078956771515 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6814814814814815, "acc_stderr": 0.040247784019771096, "acc_norm": 0.6814814814814815, "acc_norm_stderr": 0.040247784019771096 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7302631578947368, "acc_stderr": 0.03611780560284898, "acc_norm": 0.7302631578947368, "acc_norm_stderr": 0.03611780560284898 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.72, "acc_stderr": 0.045126085985421276, "acc_norm": 0.72, "acc_norm_stderr": 0.045126085985421276 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7547169811320755, "acc_stderr": 0.026480357179895702, "acc_norm": 0.7547169811320755, "acc_norm_stderr": 0.026480357179895702 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7986111111111112, "acc_stderr": 0.033536474697138406, "acc_norm": 0.7986111111111112, "acc_norm_stderr": 0.033536474697138406 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.59, "acc_stderr": 0.04943110704237102, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6473988439306358, "acc_stderr": 0.036430371689585475, "acc_norm": 0.6473988439306358, "acc_norm_stderr": 0.036430371689585475 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.49019607843137253, "acc_stderr": 0.04974229460422817, "acc_norm": 0.49019607843137253, "acc_norm_stderr": 0.04974229460422817 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.79, "acc_stderr": 0.04093601807403326, "acc_norm": 0.79, "acc_norm_stderr": 0.04093601807403326 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5914893617021276, "acc_stderr": 0.032134180267015755, "acc_norm": 0.5914893617021276, "acc_norm_stderr": 0.032134180267015755 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5789473684210527, "acc_stderr": 0.046446020912223177, "acc_norm": 0.5789473684210527, "acc_norm_stderr": 0.046446020912223177 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6482758620689655, "acc_stderr": 0.0397923663749741, "acc_norm": 0.6482758620689655, "acc_norm_stderr": 0.0397923663749741 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.43915343915343913, "acc_stderr": 0.025559920550531, "acc_norm": 0.43915343915343913, "acc_norm_stderr": 0.025559920550531 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.48412698412698413, "acc_stderr": 0.04469881854072606, "acc_norm": 0.48412698412698413, "acc_norm_stderr": 0.04469881854072606 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.4, "acc_stderr": 0.049236596391733084, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8, "acc_stderr": 0.022755204959542936, "acc_norm": 0.8, "acc_norm_stderr": 0.022755204959542936 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5320197044334976, "acc_stderr": 
0.035107665979592154, "acc_norm": 0.5320197044334976, "acc_norm_stderr": 0.035107665979592154 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.74, "acc_stderr": 0.044084400227680794, "acc_norm": 0.74, "acc_norm_stderr": 0.044084400227680794 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7515151515151515, "acc_stderr": 0.033744026441394036, "acc_norm": 0.7515151515151515, "acc_norm_stderr": 0.033744026441394036 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8282828282828283, "acc_stderr": 0.026869716187429903, "acc_norm": 0.8282828282828283, "acc_norm_stderr": 0.026869716187429903 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9067357512953368, "acc_stderr": 0.020986854593289733, "acc_norm": 0.9067357512953368, "acc_norm_stderr": 0.020986854593289733 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6820512820512821, "acc_stderr": 0.02361088430892786, "acc_norm": 0.6820512820512821, "acc_norm_stderr": 0.02361088430892786 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3925925925925926, "acc_stderr": 0.029773847012532967, "acc_norm": 0.3925925925925926, "acc_norm_stderr": 0.029773847012532967 }, "harness|hendrycksTest-high_school_micro



