open-llm-leaderboard-old/details_silvercoder45__Mistral-7b-instruct-v0.2-summ-dpo-e2
收藏数据集概述
数据集简介
该数据集是在模型silvercoder45/Mistral-7b-instruct-v0.2-summ-dpo-e2的评估运行期间自动创建的,用于Open LLM Leaderboard。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 数据来源:数据集从1次运行中创建,每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。"train"分割始终指向最新的结果。
- 额外配置:"results"配置存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_silvercoder45__Mistral-7b-instruct-v0.2-summ-dpo-e2", "harness_winogrande_5", split="train")
最新结果
以下是最新结果: python { "all": { "acc": 0.6071506593443939, "acc_stderr": 0.0331634697242051, "acc_norm": 0.6115566365296805, "acc_norm_stderr": 0.0338383460849818, "mc1": 0.5630354957160343, "mc1_stderr": 0.017363844503195957, "mc2": 0.7053630317411392, "mc2_stderr": 0.015058893752819909 }, "harness|arc:challenge|25": { "acc": 0.590443686006826, "acc_stderr": 0.014370358632472434, "acc_norm": 0.6254266211604096, "acc_norm_stderr": 0.014144193471893456 }, "harness|hellaswag|10": { "acc": 0.6752638916550487, "acc_stderr": 0.004673191423861211, "acc_norm": 0.8530173272256523, "acc_norm_stderr": 0.003533649851728494 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6, "acc_stderr": 0.04232073695151589, "acc_norm": 0.6, "acc_norm_stderr": 0.04232073695151589 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.618421052631579, "acc_stderr": 0.03953173377749194, "acc_norm": 0.618421052631579, "acc_norm_stderr": 0.03953173377749194 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.59, "acc_stderr": 0.04943110704237102, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6830188679245283, "acc_stderr": 0.02863723563980089, "acc_norm": 0.6830188679245283, "acc_norm_stderr": 0.02863723563980089 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6875, "acc_stderr": 0.038760854559127644, "acc_norm": 0.6875, "acc_norm_stderr": 0.038760854559127644 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.38, "acc_stderr": 0.04878317312145632, "acc_norm": 0.38, "acc_norm_stderr": 0.04878317312145632 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5895953757225434, "acc_stderr": 0.037507570448955356, "acc_norm": 0.5895953757225434, "acc_norm_stderr": 0.037507570448955356 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.04897104952726367, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.04897104952726367 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5276595744680851, "acc_stderr": 0.03263597118409769, "acc_norm": 0.5276595744680851, "acc_norm_stderr": 0.03263597118409769 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.42105263157894735, "acc_stderr": 0.046446020912223177, "acc_norm": 0.42105263157894735, "acc_norm_stderr": 0.046446020912223177 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6275862068965518, "acc_stderr": 0.04028731532947558, "acc_norm": 0.6275862068965518, "acc_norm_stderr": 0.04028731532947558 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.38095238095238093, "acc_stderr": 0.025010749116137595, "acc_norm": 0.38095238095238093, "acc_norm_stderr": 0.025010749116137595 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.42857142857142855, "acc_stderr": 0.0442626668137991, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.0442626668137991 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.39, "acc_stderr": 0.04902071300001974, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6064516129032258, "acc_stderr": 0.027791878753132274, "acc_norm": 0.6064516129032258, "acc_norm_stderr": 0.027791878753132274 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5123152709359606, "acc_stderr": 0.035169204442208966, "acc_norm": 0.5123152709359606, "acc_norm_stderr": 0.035169204442208966 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.61, "acc_stderr": 0.04902071300001974, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7393939393939394, "acc_stderr": 0.034277431758165236, "acc_norm": 0.7393939393939394, "acc_norm_stderr": 0.034277431758165236 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7575757575757576, "acc_stderr": 0.03053289223393202, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.03053289223393202 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8549222797927462, "acc_stderr": 0.02541634309630644, "acc_norm": 0.8549222797927462, "acc_norm_stderr": 0.02541634309630644 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5769230769230769, "acc_stderr": 0.025049197876042345, "acc_norm": 0.5769230769230769, "acc_norm_stderr": 0.025049197876042345 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3074074074074074, "acc_stderr": 0.02813325257881563, "acc_norm": 0.3074074074074074, "acc_norm_stderr": 0.02813325257881563 }, "harness|hendrycksTest-



