open-llm-leaderboard-old/details_VAGOsolutions__SauerkrautLM-7b-LaserChat
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 VAGOsolutions/SauerkrautLM-7b-LaserChat 的运行过程中自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,该分割以运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_VAGOsolutions__SauerkrautLM-7b-LaserChat", "harness_winogrande_5", split="train")
```
最新结果
这些是最新结果,来自 2024-02-09T16:19:16.787182 的运行: python { "all": { "acc": 0.6520471552592098, "acc_stderr": 0.0317831941394038, "acc_norm": 0.6529175499543937, "acc_norm_stderr": 0.03243149427349452, "mc1": 0.39657282741738065, "mc1_stderr": 0.017124930942023518, "mc2": 0.5608405966931661, "mc2_stderr": 0.015238807108954342 }, "harness|arc:challenge|25": { "acc": 0.6339590443686007, "acc_stderr": 0.014077223108470142, "acc_norm": 0.6757679180887372, "acc_norm_stderr": 0.013678810399518826 }, "harness|hellaswag|10": { "acc": 0.6329416450906195, "acc_stderr": 0.004810175357870934, "acc_norm": 0.8357896833300139, "acc_norm_stderr": 0.003697091837632076 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.32, "acc_stderr": 0.04688261722621504, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6444444444444445, "acc_stderr": 0.04135176749720385, "acc_norm": 0.6444444444444445, "acc_norm_stderr": 0.04135176749720385 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7105263157894737, "acc_stderr": 0.03690677986137283, "acc_norm": 0.7105263157894737, "acc_norm_stderr": 0.03690677986137283 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7018867924528301, "acc_stderr": 0.02815283794249386, "acc_norm": 0.7018867924528301, "acc_norm_stderr": 0.02815283794249386 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7638888888888888, "acc_stderr": 0.03551446610810826, "acc_norm": 0.7638888888888888, "acc_norm_stderr": 0.03551446610810826 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.57, "acc_stderr": 0.049756985195624284, "acc_norm": 0.57, "acc_norm_stderr": 0.049756985195624284 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.36, "acc_stderr": 0.048241815132442176, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6994219653179191, "acc_stderr": 0.0349610148119118, "acc_norm": 0.6994219653179191, "acc_norm_stderr": 0.0349610148119118 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3431372549019608, "acc_stderr": 0.047240073523838876, "acc_norm": 0.3431372549019608, "acc_norm_stderr": 0.047240073523838876 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.04229525846816505, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816505 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5872340425531914, "acc_stderr": 0.03218471141400351, "acc_norm": 0.5872340425531914, "acc_norm_stderr": 0.03218471141400351 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5, "acc_stderr": 0.047036043419179864, "acc_norm": 0.5, "acc_norm_stderr": 0.047036043419179864 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5862068965517241, "acc_stderr": 0.04104269211806232, "acc_norm": 0.5862068965517241, "acc_norm_stderr": 0.04104269211806232 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41798941798941797, "acc_stderr": 0.02540255550326091, "acc_norm": 0.41798941798941797, "acc_norm_stderr": 0.02540255550326091 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.46825396825396826, "acc_stderr": 0.04463112720677172, "acc_norm": 0.46825396825396826, "acc_norm_stderr": 0.04463112720677172 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7935483870967742, "acc_stderr": 0.023025899617188723, "acc_norm": 0.7935483870967742, "acc_norm_stderr": 0.023025899617188723 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.4827586206896552, "acc_stderr": 
0.035158955511656986, "acc_norm": 0.4827586206896552, "acc_norm_stderr": 0.035158955511656986 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.71, "acc_stderr": 0.045604802157206845, "acc_norm": 0.71, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7696969696969697, "acc_stderr": 0.032876667586034906, "acc_norm": 0.7696969696969697, "acc_norm_stderr": 0.032876667586034906 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7777777777777778, "acc_stderr": 0.02962022787479048, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.02962022787479048 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9015544041450777, "acc_stderr": 0.021500249576033467, "acc_norm": 0.9015544041450777, "acc_norm_stderr": 0.021500249576033467 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6717948717948717, "acc_stderr": 0.023807633198657266, "acc_norm": 0.6717948717948717, "acc_norm_stderr": 0.023807633198657266 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.34444444444444444, "acc_stderr": 0.02897264888484427, "acc_norm": 0.34444444444444444, "acc_norm_stderr": 0.02897264888484427 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.680672268907563, "acc_stderr": 0.03



