open-llm-leaderboard-old/details_DreadPoor__JustToSuffer-7B-slerp
收藏数据集概述
数据集简介
该数据集是在评估模型DreadPoor/JustToSuffer-7B-slerp在Open LLM Leaderboard上的运行过程中自动创建的。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 创建来源:从1次运行中创建。每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- 训练分割:"train"分割始终指向最新的结果。
- 结果配置:一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示在Open LLM Leaderboard上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_DreadPoor__JustToSuffer-7B-slerp", "harness_winogrande_5", split="train")
最新结果
以下是2024-02-17T16:45:59.965925运行的最新结果:
python { "all": { "acc": 0.6495053299013857, "acc_stderr": 0.03213728048297641, "acc_norm": 0.6511170368490796, "acc_norm_stderr": 0.03278191957728323, "mc1": 0.4541003671970624, "mc1_stderr": 0.017429593091323522, "mc2": 0.6269022526171036, "mc2_stderr": 0.015516860373603584 }, "harness|arc:challenge|25": { "acc": 0.6604095563139932, "acc_stderr": 0.013839039762820167, "acc_norm": 0.689419795221843, "acc_norm_stderr": 0.013522292098053067 }, "harness|hellaswag|10": { "acc": 0.7030472017526389, "acc_stderr": 0.00455981758918207, "acc_norm": 0.8678550089623581, "acc_norm_stderr": 0.003379562298387565 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6370370370370371, "acc_stderr": 0.04153948404742398, "acc_norm": 0.6370370370370371, "acc_norm_stderr": 0.04153948404742398 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7039473684210527, "acc_stderr": 0.03715062154998904, "acc_norm": 0.7039473684210527, "acc_norm_stderr": 0.03715062154998904 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7132075471698113, "acc_stderr": 0.027834912527544067, "acc_norm": 0.7132075471698113, "acc_norm_stderr": 0.027834912527544067 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7569444444444444, "acc_stderr": 0.035868792800803406, "acc_norm": 0.7569444444444444, "acc_norm_stderr": 0.035868792800803406 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6820809248554913, "acc_stderr": 0.0355068398916558, "acc_norm": 0.6820809248554913, "acc_norm_stderr": 0.0355068398916558 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4215686274509804, "acc_stderr": 0.049135952012744975, "acc_norm": 0.4215686274509804, "acc_norm_stderr": 0.049135952012744975 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.74, "acc_stderr": 0.04408440022768079, "acc_norm": 0.74, "acc_norm_stderr": 0.04408440022768079 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5787234042553191, "acc_stderr": 0.03227834510146268, "acc_norm": 0.5787234042553191, "acc_norm_stderr": 0.03227834510146268 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5087719298245614, "acc_stderr": 0.04702880432049615, "acc_norm": 0.5087719298245614, "acc_norm_stderr": 0.04702880432049615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5310344827586206, "acc_stderr": 0.04158632762097828, "acc_norm": 0.5310344827586206, "acc_norm_stderr": 0.04158632762097828 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.40476190476190477, "acc_stderr": 0.025279850397404904, "acc_norm": 0.40476190476190477, "acc_norm_stderr": 0.025279850397404904 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.42857142857142855, "acc_stderr": 0.0442626668137991, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.0442626668137991 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7903225806451613, "acc_stderr": 0.023157879349083525, "acc_norm": 0.7903225806451613, "acc_norm_stderr": 0.023157879349083525 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5073891625615764, "acc_stderr": 0.035176035403610105, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.67, "acc_stderr": 0.04725815626252609, "acc_norm": 0.67, "acc_norm_stderr": 0.04725815626252609 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7757575757575758, "acc_stderr": 0.03256866661681102, "acc_norm": 0.7757575757575758, "acc_norm_stderr": 0.03256866661681102 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7929292929292929, "acc_stderr": 0.028869778460267042, "acc_norm": 0.7929292929292929, "acc_norm_stderr": 0.028869778460267042 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8911917098445595, "acc_stderr": 0.022473253332768776, "acc_norm": 0.8911917098445595, "acc_norm_stderr": 0.022473253332768776 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6692307692307692, "acc_stderr": 0.023854795680971125, "acc_norm": 0.6692307692307692, "acc_norm_stderr": 0.023854795680971125 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.337037037037037, "acc_stderr": 0.02882088466625326, "acc_norm": 0.33703703703703



