open-llm-leaderboard-old/details_CausalLM__34b-beta
数据集概述
数据集名称
Evaluation run of CausalLM/34b-beta
数据集描述
该数据集是模型 CausalLM/34b-beta 在 Open LLM Leaderboard 上进行评估运行期间自动创建的。
数据集组成
- 数据集由 63 个配置组成,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割名称为该次运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_CausalLM__34b-beta", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-02-10T01:35:49.727207 运行的最新结果:
python { "all": { "acc": 0.8441348354388523, "acc_stderr": 0.02379515832444238, "acc_norm": 0.8532367075940402, "acc_norm_stderr": 0.024157515284528485, "mc1": 0.4039167686658507, "mc1_stderr": 0.01717727682258428, "mc2": 0.5837785963295662, "mc2_stderr": 0.01545899436626738 }, "harness|arc:challenge|25": { "acc": 0.659556313993174, "acc_stderr": 0.013847460518892973, "acc_norm": 0.7056313993174061, "acc_norm_stderr": 0.013318528460539422 }, "harness|hellaswag|10": { "acc": 0.6440948018323043, "acc_stderr": 0.004778081784542404, "acc_norm": 0.8419637522405895, "acc_norm_stderr": 0.0036402949128386845 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.72, "acc_stderr": 0.04512608598542127, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542127 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.8666666666666667, "acc_stderr": 0.029365879728106857, "acc_norm": 0.8666666666666667, "acc_norm_stderr": 0.029365879728106857 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.9013157894736842, "acc_stderr": 0.02427022773752272, "acc_norm": 0.9013157894736842, "acc_norm_stderr": 0.02427022773752272 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.84, "acc_stderr": 0.03684529491774708, "acc_norm": 0.84, "acc_norm_stderr": 0.03684529491774708 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.8981132075471698, "acc_stderr": 0.01861754975827668, "acc_norm": 0.8981132075471698, "acc_norm_stderr": 0.01861754975827668 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.9791666666666666, "acc_stderr": 0.01194372163115358, "acc_norm": 0.9791666666666666, "acc_norm_stderr": 0.01194372163115358 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.8, "acc_stderr": 0.040201512610368445, "acc_norm": 0.8, "acc_norm_stderr": 0.040201512610368445 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.72, "acc_stderr": 0.04512608598542128, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.838150289017341, "acc_stderr": 0.02808359427957575, "acc_norm": 0.838150289017341, "acc_norm_stderr": 0.02808359427957575 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.6568627450980392, "acc_stderr": 0.04724007352383889, "acc_norm": 0.6568627450980392, "acc_norm_stderr": 0.04724007352383889 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.88, "acc_stderr": 0.032659863237109066, "acc_norm": 0.88, "acc_norm_stderr": 0.032659863237109066 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.8893617021276595, "acc_stderr": 0.02050614509900843, "acc_norm": 0.8893617021276595, "acc_norm_stderr": 0.02050614509900843 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.7017543859649122, "acc_stderr": 0.04303684033537317, "acc_norm": 0.7017543859649122, "acc_norm_stderr": 0.04303684033537317 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.8758620689655172, "acc_stderr": 0.0274782369836366, "acc_norm": 0.8758620689655172, "acc_norm_stderr": 0.0274782369836366 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.8412698412698413, "acc_stderr": 0.01882030729513838, "acc_norm": 0.8412698412698413, "acc_norm_stderr": 0.01882030729513838 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.6428571428571429, "acc_stderr": 0.04285714285714281, "acc_norm": 0.6428571428571429, "acc_norm_stderr": 0.04285714285714281 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.68, "acc_stderr": 0.046882617226215034, "acc_norm": 0.68, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.9451612903225807, "acc_stderr": 0.012951418509899199, "acc_norm": 0.9451612903225807, "acc_norm_stderr": 0.012951418509899199 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.8177339901477833, "acc_stderr": 0.02716334085964515, 
"acc_norm": 0.8177339901477833, "acc_norm_stderr": 0.02716334085964515 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.9, "acc_stderr": 0.030151134457776348, "acc_norm": 0.9, "acc_norm_stderr": 0.030151134457776348 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.9393939393939394, "acc_stderr": 0.01863202167916562, "acc_norm": 0.9393939393939394, "acc_norm_stderr": 0.01863202167916562 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.9595959595959596, "acc_stderr": 0.014028895836494496, "acc_norm": 0.9595959595959596, "acc_norm_stderr": 0.014028895836494496 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9844559585492227, "acc_stderr": 0.008927492715084346, "acc_norm": 0.9844559585492227, "acc_norm_stderr": 0.008927492715084346 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.8871794871794871, "acc_stderr": 0.01604076143845816, "acc_norm": 0.8871794871794871, "acc_norm_stderr": 0.01604076143845816 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.7111111111111111, "acc_stderr": 0.027634907264178544, "acc_norm": 0.7111111111111111, "acc_norm_stderr": 0.027



