open-llm-leaderboard-old/details_Josephgflowers__3BigReasonCinder
数据集概述
该数据集是在评估模型 Josephgflowers/3BigReasonCinder 在 Open LLM Leaderboard 上的运行过程中自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。
数据集结构
该数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割(split),分割以该次运行的时间戳命名。"train" 分割始终指向最新的结果。
此外,一个额外的配置 "results" 存储了所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_Josephgflowers__3BigReasonCinder",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-02-09T12:31:38.090504 运行的最新结果:
python { "all": { "acc": 0.44794266994933396, "acc_stderr": 0.03464503770381712, "acc_norm": 0.4507932379135084, "acc_norm_stderr": 0.03538213666564797, "mc1": 0.2876376988984088, "mc1_stderr": 0.015846315101394816, "mc2": 0.44764087589469737, "mc2_stderr": 0.014703779857331185 }, "harness|arc:challenge|25": { "acc": 0.39078498293515357, "acc_stderr": 0.014258563880513778, "acc_norm": 0.41723549488054607, "acc_norm_stderr": 0.014409825518403082 }, "harness|hellaswag|10": { "acc": 0.4801832304321848, "acc_stderr": 0.004985860853427632, "acc_norm": 0.6515634335789683, "acc_norm_stderr": 0.004755013243022131 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4222222222222222, "acc_stderr": 0.04266763404099582, "acc_norm": 0.4222222222222222, "acc_norm_stderr": 0.04266763404099582 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.4342105263157895, "acc_stderr": 0.040335656678483184, "acc_norm": 0.4342105263157895, "acc_norm_stderr": 0.040335656678483184 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.45, "acc_stderr": 0.05, "acc_norm": 0.45, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.5094339622641509, "acc_stderr": 0.030767394707808093, "acc_norm": 0.5094339622641509, "acc_norm_stderr": 0.030767394707808093 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.4722222222222222, "acc_stderr": 0.04174752578923185, "acc_norm": 0.4722222222222222, "acc_norm_stderr": 0.04174752578923185 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.31, "acc_stderr": 0.046482319871173156, "acc_norm": 0.31, "acc_norm_stderr": 0.046482319871173156 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.33, 
"acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.3930635838150289, "acc_stderr": 0.03724249595817729, "acc_norm": 0.3930635838150289, "acc_norm_stderr": 0.03724249595817729 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.21568627450980393, "acc_stderr": 0.04092563958237654, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237654 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.37446808510638296, "acc_stderr": 0.03163910665367291, "acc_norm": 0.37446808510638296, "acc_norm_stderr": 0.03163910665367291 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.23684210526315788, "acc_stderr": 0.039994238792813344, "acc_norm": 0.23684210526315788, "acc_norm_stderr": 0.039994238792813344 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4689655172413793, "acc_stderr": 0.04158632762097828, "acc_norm": 0.4689655172413793, "acc_norm_stderr": 0.04158632762097828 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.30158730158730157, "acc_stderr": 0.0236369759961018, "acc_norm": 0.30158730158730157, "acc_norm_stderr": 0.0236369759961018 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.2619047619047619, "acc_stderr": 0.0393253768039287, "acc_norm": 0.2619047619047619, "acc_norm_stderr": 0.0393253768039287 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.5096774193548387, "acc_stderr": 0.02843867799890955, "acc_norm": 0.5096774193548387, "acc_norm_stderr": 0.02843867799890955 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.3793103448275862, "acc_stderr": 0.03413963805906235, "acc_norm": 
0.3793103448275862, "acc_norm_stderr": 0.03413963805906235 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.46, "acc_stderr": 0.05009082659620332, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.5454545454545454, "acc_stderr": 0.03888176921674101, "acc_norm": 0.5454545454545454, "acc_norm_stderr": 0.03888176921674101 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.5858585858585859, "acc_stderr": 0.03509438348879629, "acc_norm": 0.5858585858585859, "acc_norm_stderr": 0.03509438348879629 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.5699481865284974, "acc_stderr": 0.03572954333144808, "acc_norm": 0.5699481865284974, "acc_norm_stderr": 0.03572954333144808 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.3974358974358974, "acc_stderr": 0.024811920017903836, "acc_norm": 0.3974358974358974, "acc_norm_stderr": 0.024811920017903836 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.23333333333333334, "acc_stderr": 0.02578787422095931, "acc_norm": 0.23333333333333334, "acc_norm_stderr": 0.02578787422095931 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0



