five

open-llm-leaderboard-old/details_rombodawg__Everyone-Coder-4x7b-Base

收藏
Hugging Face2024-01-15 更新2024-06-22 收录
下载链接:
https://hf-mirror.com/datasets/open-llm-leaderboard-old/details_rombodawg__Everyone-Coder-4x7b-Base
下载链接
链接失效反馈
官方服务:
资源简介:
该数据集是在评估模型rombodawg/Everyone-Coder-4x7b-Base时自动创建的。数据集由63个配置组成,每个配置对应一个评估任务。数据集由2次运行生成,每次运行的结果作为特定配置中的一个分割,分割名称使用运行的时间戳。train分割始终指向最新的结果。此外,还有一个名为results的配置,存储了所有运行的聚合结果,并用于在Open LLM Leaderboard上计算和显示聚合指标。

该数据集是在评估模型rombodawg/Everyone-Coder-4x7b-Base时自动创建的。数据集由63个配置组成,每个配置对应一个评估任务。数据集由2次运行生成,每次运行的结果作为特定配置中的一个分割,分割名称使用运行的时间戳。train分割始终指向最新的结果。此外,还有一个名为results的配置,存储了所有运行的聚合结果,并用于在Open LLM Leaderboard上计算和显示聚合指标。
提供机构:
open-llm-leaderboard-old
原始信息汇总

数据集概述

数据集组成

  • 该数据集包含63个配置,每个配置对应一个评估任务。
  • 数据集由2次运行结果组成,每次运行的结果可以在每个配置中找到,并以运行的时间戳命名。
  • "train"分割总是指向最新的结果。

额外配置

  • 一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示在Open LLM Leaderboard上的聚合指标。

数据加载示例

python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_rombodawg__Everyone-Coder-4x7b-Base", "harness_winogrande_5", split="train")

最新结果

python { "all": { "acc": 0.6447898132540958, "acc_stderr": 0.031915985387073305, "acc_norm": 0.6461876134084575, "acc_norm_stderr": 0.03255592718009434, "mc1": 0.3390452876376989, "mc1_stderr": 0.016571797910626615, "mc2": 0.49160643723765735, "mc2_stderr": 0.015188709391608397 }, "harness|arc:challenge|25": { "acc": 0.6117747440273038, "acc_stderr": 0.014241614207414046, "acc_norm": 0.6450511945392492, "acc_norm_stderr": 0.013983036904094087 }, "harness|hellaswag|10": { "acc": 0.6623182632941645, "acc_stderr": 0.004719529099913131, "acc_norm": 0.8481378211511651, "acc_norm_stderr": 0.0035815378475817965 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6222222222222222, "acc_stderr": 0.04188307537595852, "acc_norm": 0.6222222222222222, "acc_norm_stderr": 0.04188307537595852 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6907894736842105, "acc_stderr": 0.037610708698674805, "acc_norm": 0.6907894736842105, "acc_norm_stderr": 0.037610708698674805 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.62, "acc_stderr": 0.048783173121456316, "acc_norm": 0.62, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6981132075471698, "acc_stderr": 0.02825420034443866, "acc_norm": 0.6981132075471698, "acc_norm_stderr": 0.02825420034443866 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7638888888888888, "acc_stderr": 0.03551446610810826, "acc_norm": 0.7638888888888888, "acc_norm_stderr": 0.03551446610810826 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956911 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.54, "acc_stderr": 0.05009082659620332, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6358381502890174, "acc_stderr": 0.03669072477416907, "acc_norm": 0.6358381502890174, "acc_norm_stderr": 0.03669072477416907 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.39215686274509803, "acc_stderr": 0.04858083574266345, "acc_norm": 0.39215686274509803, "acc_norm_stderr": 0.04858083574266345 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.78, "acc_stderr": 0.04163331998932263, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932263 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5531914893617021, "acc_stderr": 0.0325005368436584, "acc_norm": 0.5531914893617021, "acc_norm_stderr": 0.0325005368436584 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4473684210526316, "acc_stderr": 0.04677473004491199, "acc_norm": 0.4473684210526316, "acc_norm_stderr": 0.04677473004491199 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5517241379310345, "acc_stderr": 0.04144311810878152, "acc_norm": 0.5517241379310345, "acc_norm_stderr": 0.04144311810878152 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41534391534391535, "acc_stderr": 0.025379524910778408, "acc_norm": 0.41534391534391535, "acc_norm_stderr": 0.025379524910778408 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3888888888888889, "acc_stderr": 0.0436031486007746, "acc_norm": 0.3888888888888889, "acc_norm_stderr": 0.0436031486007746 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.29, "acc_stderr": 0.04560480215720684, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7580645161290323, "acc_stderr": 0.024362599693031096, "acc_norm": 0.7580645161290323, "acc_norm_stderr": 0.024362599693031096 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5172413793103449, "acc_stderr": 0.035158955511656986, "acc_norm": 0.5172413793103449, "acc_norm_stderr": 0.035158955511656986 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.793939393939394, "acc_stderr": 0.031584153240477114, "acc_norm": 0.793939393939394, "acc_norm_stderr": 0.031584153240477114 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7828282828282829, "acc_stderr": 0.02937661648494563, "acc_norm": 0.7828282828282829, "acc_norm_stderr": 0.02937661648494563 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9015544041450777, "acc_stderr": 0.02150024957603346, "acc_norm": 0.9015544041450777, "acc_norm_stderr": 0.02150024957603346 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6461538461538462, "acc_stderr": 0.024243783994062153, "acc_norm": 0.6461538461538462, "acc_norm_stderr": 0.024243783994062153 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.36666666666666664, "acc_stderr": 0.029381620726465076, "acc_norm": 0.36666666666666664, "acc_norm_stderr": 0.029381620726465076 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.634453781512605, "acc_stderr": 0.031282177063684614, "acc_norm": 0.634453781512605, "acc_norm_stderr": 0.031282177063684614 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.31125827814569534, "acc_stderr": 0.03780445850526732, "acc_norm": 0.31125827814569534, "acc_norm_stderr": 0.03780445850526732 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8201834862385321, "acc_stderr": 0.01646534546739154, "acc_norm": 0.8201834862385321, "acc_norm_stderr": 0.01646534546739154 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5185185185185185, "acc_stderr": 0.03407632093854051, "acc_norm": 0.5185185185185185, "acc_norm_stderr": 0.03407632093854051 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.7794117647058824, "acc_stderr": 0.02910225438967407, "acc_norm": 0.7794117647058824, "acc_norm_stderr": 0.02910225438967407 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.7974683544303798, "acc_stderr": 0.026160568246601446, "acc_norm": 0.7974683544303798, "acc_norm_stderr": 0.026160568246601446 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6905829596412556, "acc_stderr": 0.03102441174057222, "acc_norm": 0.6905829596412556, "acc_norm_stderr": 0.03102441174057222 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7938931297709924, "acc_stderr": 0.03547771004159463, "acc_norm": 0.7938931297709924, "acc_norm_stderr": 0.03547771004159463 }, "harness|hendrycksTest-international_law|5": { "acc": 0.8181818181818182, "acc_stderr": 0.03520893951097652, "acc_norm": 0.8181818181818182, "acc_norm_stderr": 0.03520893951097652 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.8055555555555556, "acc_stderr": 0.038260763248848646, "acc_norm": 0.8055555555555556, "acc_norm_stderr": 0.038260763248848646 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7975460122699386, "acc_stderr": 0.031570650789119005, "acc_norm": 0.7975460122699386, "acc_norm_stderr": 0.031570650789119005 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.5, "acc_stderr": 0.04745789978762494, "acc_norm": 0.5, "acc_norm_stderr": 0.04745789978762494 }, "harness|hendrycksTest-management|5": { "acc": 0.8349514563106796, "acc_stderr": 0.036756688322331886, "acc_norm": 0.8349514563106796, "acc_norm_stderr": 0.036756688322331886 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8760683760683761, "acc_stderr": 0.021586494001281382, "acc_norm": 0.8760683760683761, "acc_norm_stderr": 0.021586494001281382 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.73, "acc_stderr": 0.044619604333847394, "acc_norm": 0.73, "acc_norm_stderr": 0.044619604333847394 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.8212005108556832, "acc_stderr": 0.013702643715368983, "acc_norm": 0.8212005108556832, "acc_norm_stderr": 0.013702643715368983 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7398843930635838, "acc_stderr": 0.023618678310069356, "acc_norm": 0.7398843930635838, "acc_norm_stderr": 0.023618678310069356 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.3318435754189944, "acc_stderr": 0.015748421208187303, "acc_norm": 0.3318435754189944, "acc_norm_stderr": 0.015748421208187303 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7352941176470589, "acc_stderr": 0.02526169121972948, "acc_norm": 0.7352941176470589, "acc_norm_stderr": 0.02526169121972948 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.707395498392283, "acc_stderr": 0.025839898334877983, "acc_norm": 0.707395498392283, "acc_norm_stderr": 0.025839898334877983 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7561728395061729, "acc_stderr": 0.023891879541959603, "acc_norm": 0.7561728395061729, "acc_norm_stderr": 0.023891879541959603 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.4858156028368794, "acc_stderr": 0.02981549448368206, "acc_norm": 0.4858156028368794, "acc_norm_stderr": 0.02981549448368206 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.45827900912646674, "acc_stderr": 0.01272570165695364, "acc_norm": 0.45827900912646674, "acc_norm_stderr": 0.01272570165695364 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6801470588235294, "acc_stderr": 0.02833295951403121, "acc_norm": 0.6801470588235294, "acc_norm_stderr": 0.02833295951403121 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.673202614379085, "acc_stderr": 0.01897542792050721, "acc_norm": 0.673202614379085, "acc_norm_stderr": 0.01897542792050721 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6454545454545455, "acc_stderr": 0.045820048415054174, "acc_norm": 0.6454545454545455, "acc_norm_stderr": 0.045820048415054174 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7551020408163265, "acc_stderr": 0.027529637440174923, "acc_norm": 0.7551020408163265, "acc_norm_stderr": 0.027529637440174923 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8258706467661692, "acc_stderr": 0.026814951200421603, "acc_norm": 0.8258706467661692, "acc_norm_stderr": 0.026814951200421603 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.91, "acc_stderr": 0.028762349126466136, "acc_norm": 0.91, "acc_norm_stderr": 0.028762349126466136 }, "harness|hendrycksTest-virology|5": { "acc": 0.5542168674698795, "acc_stderr": 0.03869543323472101, "acc_norm": 0.5542168674698795, "acc_norm_stderr": 0.03869543323472101 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.847953216374269, "acc_stderr": 0.027539122889061456, "acc_norm": 0.847953216374269, "acc_norm_stderr": 0.027539122889061456 }, "harness|truthfulqa:mc|0": { "mc1": 0.3390452876376989, "mc1_stderr": 0.016571797910626615, "mc2": 0.49160643723765735, "mc2_stderr": 0.015188709391608397 }, "harness|winogrande|5": { "acc": 0.7916337805840569, "acc_stderr": 0.011414554399987729 }, "harness|gsm8k|5": { "acc": 0.6345716451857468, "acc_stderr": 0.013264282030266635 } }

5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作