open-llm-leaderboard-old/details_deepseek-ai__deepseek-moe-16b-base
数据集概述
该数据集是在对模型 deepseek-ai/deepseek-moe-16b-base 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- 每个配置中的 "train" 分割总是指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_deepseek-ai__deepseek-moe-16b-base", "harness_winogrande_5", split="train")
```
最新结果
以下是最新结果的摘要:
python { "all": { "acc": 0.465522984657348, "acc_stderr": 0.034469796748715614, "acc_norm": 0.46990944729307677, "acc_norm_stderr": 0.03523647567293407, "mc1": 0.23745410036719705, "mc1_stderr": 0.014896277441041836, "mc2": 0.3607930335233562, "mc2_stderr": 0.01354653975819568 }, "harness|arc:challenge|25": { "acc": 0.49658703071672355, "acc_stderr": 0.014611050403244077, "acc_norm": 0.5324232081911263, "acc_norm_stderr": 0.014580637569995423 }, "harness|hellaswag|10": { "acc": 0.5957976498705437, "acc_stderr": 0.004897340793314379, "acc_norm": 0.7977494523003386, "acc_norm_stderr": 0.004008571431483689 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.27, "acc_stderr": 0.044619604333847415, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847415 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.3925925925925926, "acc_stderr": 0.04218506215368879, "acc_norm": 0.3925925925925926, "acc_norm_stderr": 0.04218506215368879 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.4605263157894737, "acc_stderr": 0.04056242252249034, "acc_norm": 0.4605263157894737, "acc_norm_stderr": 0.04056242252249034 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.48, "acc_stderr": 0.050211673156867795, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.4716981132075472, "acc_stderr": 0.0307235352490061, "acc_norm": 0.4716981132075472, "acc_norm_stderr": 0.0307235352490061 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5347222222222222, "acc_stderr": 0.04171115858181618, "acc_norm": 0.5347222222222222, "acc_norm_stderr": 0.04171115858181618 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.37, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.3930635838150289, "acc_stderr": 0.0372424959581773, "acc_norm": 0.3930635838150289, "acc_norm_stderr": 0.0372424959581773 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.27450980392156865, "acc_stderr": 0.04440521906179327, "acc_norm": 0.27450980392156865, "acc_norm_stderr": 0.04440521906179327 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.62, "acc_stderr": 0.048783173121456316, "acc_norm": 0.62, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.37446808510638296, "acc_stderr": 0.031639106653672915, "acc_norm": 0.37446808510638296, "acc_norm_stderr": 0.031639106653672915 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2719298245614035, "acc_stderr": 0.041857744240220554, "acc_norm": 0.2719298245614035, "acc_norm_stderr": 0.041857744240220554 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4827586206896552, "acc_stderr": 0.04164188720169377, "acc_norm": 0.4827586206896552, "acc_norm_stderr": 0.04164188720169377 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.29365079365079366, "acc_stderr": 0.023456037383982022, "acc_norm": 0.29365079365079366, "acc_norm_stderr": 0.023456037383982022 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.30952380952380953, "acc_stderr": 0.04134913018303316, "acc_norm": 0.30952380952380953, "acc_norm_stderr": 0.04134913018303316 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.4870967741935484, "acc_stderr": 0.028434533152681855, "acc_norm": 0.4870967741935484, "acc_norm_stderr": 0.028434533152681855 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.2955665024630542, "acc_stderr": 
0.032104944337514575, "acc_norm": 0.2955665024630542, "acc_norm_stderr": 0.032104944337514575 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.46, "acc_stderr": 0.05009082659620332, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.5393939393939394, "acc_stderr": 0.03892207016552012, "acc_norm": 0.5393939393939394, "acc_norm_stderr": 0.03892207016552012 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.5555555555555556, "acc_stderr": 0.03540294377095367, "acc_norm": 0.5555555555555556, "acc_norm_stderr": 0.03540294377095367 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.616580310880829, "acc_stderr": 0.03508984236295341, "acc_norm": 0.616580310880829, "acc_norm_stderr": 0.03508984236295341 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.41025641025641024, "acc_stderr": 0.02493931390694078, "acc_norm": 0.41025641025641024, "acc_norm_stderr": 0.02493931390694078 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2518518518518518, "acc_stderr": 0.026466117538959912, "acc_norm": 0.2518518518518518, "acc_norm_stderr": 0.026466117538959912 }, "harness|hendrycksTest-




