open-llm-leaderboard-old/details_l3utterfly__minima-3b-layla-v2
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上对模型 l3utterfly/minima-3b-layla-v2 进行评估运行时自动创建的。
数据集组成
- 数据集由 63 个配置组成,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中都对应一个特定的分片(split),分片以该次运行的时间戳命名。
- "train" 分片始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果(用于计算和显示 Open LLM Leaderboard 上的聚合指标)。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_l3utterfly__minima-3b-layla-v2", "harness_winogrande_5", split="train")
```
最新结果
这些是最新的结果,来自 2023-12-24T14:17:53.842048 的运行: python { "all": { "acc": 0.29413179589868294, "acc_stderr": 0.032120734757445236, "acc_norm": 0.2950139868612686, "acc_norm_stderr": 0.03284694916274657, "mc1": 0.26805385556915545, "mc1_stderr": 0.015506204722834559, "mc2": 0.43635958393568464, "mc2_stderr": 0.014541344314280011 }, "harness|arc:challenge|25": { "acc": 0.4138225255972696, "acc_stderr": 0.014392730009221007, "acc_norm": 0.44197952218430037, "acc_norm_stderr": 0.014512682523128345 }, "harness|hellaswag|10": { "acc": 0.522903804023103, "acc_stderr": 0.004984543540932335, "acc_norm": 0.6992630950009958, "acc_norm_stderr": 0.0045764127139515 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.22, "acc_stderr": 0.04163331998932268, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932268 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.2, "acc_stderr": 0.034554737023254366, "acc_norm": 0.2, "acc_norm_stderr": 0.034554737023254366 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.26973684210526316, "acc_stderr": 0.03611780560284898, "acc_norm": 0.26973684210526316, "acc_norm_stderr": 0.03611780560284898 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2792452830188679, "acc_stderr": 0.027611163402399715, "acc_norm": 0.2792452830188679, "acc_norm_stderr": 0.027611163402399715 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2777777777777778, "acc_stderr": 0.03745554791462457, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.03745554791462457 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.22, "acc_stderr": 0.04163331998932269, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932269 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.23699421965317918, "acc_stderr": 0.03242414757483098, "acc_norm": 0.23699421965317918, "acc_norm_stderr": 0.03242414757483098 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.2549019607843137, "acc_stderr": 0.043364327079931785, "acc_norm": 0.2549019607843137, "acc_norm_stderr": 0.043364327079931785 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.2680851063829787, "acc_stderr": 0.028957342788342343, "acc_norm": 0.2680851063829787, "acc_norm_stderr": 0.028957342788342343 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.20175438596491227, "acc_stderr": 0.037752050135836386, "acc_norm": 0.20175438596491227, "acc_norm_stderr": 0.037752050135836386 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.35172413793103446, "acc_stderr": 0.03979236637497411, "acc_norm": 0.35172413793103446, "acc_norm_stderr": 0.03979236637497411 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.23809523809523808, "acc_stderr": 0.021935878081184763, "acc_norm": 0.23809523809523808, "acc_norm_stderr": 0.021935878081184763 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.2857142857142857, "acc_stderr": 0.0404061017820884, "acc_norm": 0.2857142857142857, "acc_norm_stderr": 0.0404061017820884 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.2870967741935484, "acc_stderr": 0.025736542745594528, "acc_norm": 0.2870967741935484, "acc_norm_stderr": 0.025736542745594528 }, "harness|hendrycksTest-high_school_chemistry|5": { 
"acc": 0.17733990147783252, "acc_stderr": 0.02687433727680835, "acc_norm": 0.17733990147783252, "acc_norm_stderr": 0.02687433727680835 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.27, "acc_stderr": 0.0446196043338474, "acc_norm": 0.27, "acc_norm_stderr": 0.0446196043338474 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.296969696969697, "acc_stderr": 0.035679697722680474, "acc_norm": 0.296969696969697, "acc_norm_stderr": 0.035679697722680474 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.25252525252525254, "acc_stderr": 0.030954055470365897, "acc_norm": 0.25252525252525254, "acc_norm_stderr": 0.030954055470365897 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.27979274611398963, "acc_stderr": 0.03239637046735704, "acc_norm": 0.27979274611398963, "acc_norm_stderr": 0.03239637046735704 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.2923076923076923, "acc_stderr": 0.02306043838085774, "acc_norm": 0.2923076923076923, "acc_norm_stderr": 0.02306043838085774 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.24074074074074073, "acc_stderr": 0.02606715922227578, "acc_norm": 0.24074074074074073, "acc_norm_stderr": 0.02606715922227578 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.28991596638



