open-llm-leaderboard-old/details_CorticalStack__mistral-7b-slimorca-sft
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上对模型 CorticalStack/mistral-7b-slimorca-sft 进行评估运行时自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train"分割始终指向最新的结果。
- 额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_CorticalStack__mistral-7b-slimorca-sft", "harness_winogrande_5", split="train")
```
最新结果
以下是运行 2024-02-16T15:13:14.245418 的最新结果: python { "all": { "acc": 0.6048142926851883, "acc_stderr": 0.032906396487846115, "acc_norm": 0.6105167673880798, "acc_norm_stderr": 0.03358845031709559, "mc1": 0.3402692778457772, "mc1_stderr": 0.016586304901762557, "mc2": 0.5018293862426123, "mc2_stderr": 0.014695173813842227 }, "harness|arc:challenge|25": { "acc": 0.552901023890785, "acc_stderr": 0.014529380160526843, "acc_norm": 0.5853242320819113, "acc_norm_stderr": 0.014397070564409174 }, "harness|hellaswag|10": { "acc": 0.6249751045608445, "acc_stderr": 0.004831399218500236, "acc_norm": 0.8316072495518821, "acc_norm_stderr": 0.003734498979207306 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.047258156262526066, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526066 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5851851851851851, "acc_stderr": 0.04256193767901408, "acc_norm": 0.5851851851851851, "acc_norm_stderr": 0.04256193767901408 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5986842105263158, "acc_stderr": 0.03988903703336284, "acc_norm": 0.5986842105263158, "acc_norm_stderr": 0.03988903703336284 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.55, "acc_stderr": 0.05, "acc_norm": 0.55, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6754716981132075, "acc_stderr": 0.02881561571343211, "acc_norm": 0.6754716981132075, "acc_norm_stderr": 0.02881561571343211 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6527777777777778, "acc_stderr": 0.039812405437178615, "acc_norm": 0.6527777777777778, "acc_norm_stderr": 0.039812405437178615 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6069364161849711, "acc_stderr": 0.0372424959581773, "acc_norm": 0.6069364161849711, "acc_norm_stderr": 0.0372424959581773 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.35294117647058826, "acc_stderr": 0.04755129616062946, "acc_norm": 0.35294117647058826, "acc_norm_stderr": 0.04755129616062946 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.73, "acc_stderr": 0.044619604333847394, "acc_norm": 0.73, "acc_norm_stderr": 0.044619604333847394 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5446808510638298, "acc_stderr": 0.03255525359340355, "acc_norm": 0.5446808510638298, "acc_norm_stderr": 0.03255525359340355 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4298245614035088, "acc_stderr": 0.04657047260594964, "acc_norm": 0.4298245614035088, "acc_norm_stderr": 0.04657047260594964 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5793103448275863, "acc_stderr": 0.0411391498118926, "acc_norm": 0.5793103448275863, "acc_norm_stderr": 0.0411391498118926 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4074074074074074, "acc_stderr": 0.02530590624159063, "acc_norm": 0.4074074074074074, "acc_norm_stderr": 0.02530590624159063 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4126984126984127, "acc_stderr": 0.04403438954768176, "acc_norm": 0.4126984126984127, "acc_norm_stderr": 0.04403438954768176 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7193548387096774, "acc_stderr": 0.02556060472102288, "acc_norm": 0.7193548387096774, "acc_norm_stderr": 0.02556060472102288 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.4876847290640394, "acc_stderr": 0.035169204442208966, "acc_norm": 0.4876847290640394, "acc_norm_stderr": 0.035169204442208966 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.63, "acc_stderr": 0.04852365870939099, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7454545454545455, "acc_stderr": 0.03401506715249039, "acc_norm": 0.7454545454545455, "acc_norm_stderr": 0.03401506715249039 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7626262626262627, "acc_stderr": 0.030313710538198892, "acc_norm": 0.7626262626262627, "acc_norm_stderr": 0.030313710538198892 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8393782383419689, "acc_stderr": 0.02649905770139746, "acc_norm": 0.8393782383419689, "acc_norm_stderr": 0.02649905770139746 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6, "acc_stderr": 0.024838811988033165, "acc_norm": 0.6, "acc_norm_stderr": 0.024838811988033165 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2777777777777778, "acc_stderr": 0.02730914058823018, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.02730914058823018 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6092436974789915,



