open-llm-leaderboard-old/details_BarryFutureman__WestLakeX-7B-EvoMerge-Variant2
数据集概述
该数据集是在对模型 BarryFutureman/WestLakeX-7B-EvoMerge-Variant2 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行都对应每个配置中的一个特定分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_BarryFutureman__WestLakeX-7B-EvoMerge-Variant2", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-02-02T03:18:15.694379 运行的最新结果:
python { "all": { "acc": 0.6538297853321007, "acc_stderr": 0.0320522890373237, "acc_norm": 0.6530566018124656, "acc_norm_stderr": 0.03272874681048371, "mc1": 0.5618115055079559, "mc1_stderr": 0.01736923616440441, "mc2": 0.7034639754228852, "mc2_stderr": 0.014889031021791599 }, "harness|arc:challenge|25": { "acc": 0.7081911262798635, "acc_stderr": 0.013284525292403511, "acc_norm": 0.7252559726962458, "acc_norm_stderr": 0.013044617212771227 }, "harness|hellaswag|10": { "acc": 0.7144991037641903, "acc_stderr": 0.0045072961962278075, "acc_norm": 0.8851822346146186, "acc_norm_stderr": 0.003181503506054323 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6444444444444445, "acc_stderr": 0.04135176749720386, "acc_norm": 0.6444444444444445, "acc_norm_stderr": 0.04135176749720386 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6973684210526315, "acc_stderr": 0.0373852067611967, "acc_norm": 0.6973684210526315, "acc_norm_stderr": 0.0373852067611967 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.62, "acc_stderr": 0.048783173121456316, "acc_norm": 0.62, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7132075471698113, "acc_stderr": 0.027834912527544064, "acc_norm": 0.7132075471698113, "acc_norm_stderr": 0.027834912527544064 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7777777777777778, "acc_stderr": 0.03476590104304134, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.03476590104304134 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.45, "acc_stderr": 0.05, "acc_norm": 0.45, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.55, "acc_stderr": 0.05, "acc_norm": 0.55, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, 
"acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6647398843930635, "acc_stderr": 0.03599586301247077, "acc_norm": 0.6647398843930635, "acc_norm_stderr": 0.03599586301247077 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.048971049527263666, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.048971049527263666 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.04229525846816506, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816506 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5659574468085107, "acc_stderr": 0.03240038086792747, "acc_norm": 0.5659574468085107, "acc_norm_stderr": 0.03240038086792747 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5263157894736842, "acc_stderr": 0.046970851366478626, "acc_norm": 0.5263157894736842, "acc_norm_stderr": 0.046970851366478626 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5586206896551724, "acc_stderr": 0.04137931034482757, "acc_norm": 0.5586206896551724, "acc_norm_stderr": 0.04137931034482757 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.43915343915343913, "acc_stderr": 0.025559920550531003, "acc_norm": 0.43915343915343913, "acc_norm_stderr": 0.025559920550531003 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5079365079365079, "acc_stderr": 0.044715725362943486, "acc_norm": 0.5079365079365079, "acc_norm_stderr": 0.044715725362943486 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7806451612903226, "acc_stderr": 0.023540799358723295, "acc_norm": 0.7806451612903226, "acc_norm_stderr": 0.023540799358723295 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5024630541871922, "acc_stderr": 0.035179450386910616, "acc_norm": 0.5024630541871922, 
"acc_norm_stderr": 0.035179450386910616 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7818181818181819, "acc_stderr": 0.03225078108306289, "acc_norm": 0.7818181818181819, "acc_norm_stderr": 0.03225078108306289 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8080808080808081, "acc_stderr": 0.028057791672989017, "acc_norm": 0.8080808080808081, "acc_norm_stderr": 0.028057791672989017 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8963730569948186, "acc_stderr": 0.02199531196364424, "acc_norm": 0.8963730569948186, "acc_norm_stderr": 0.02199531196364424 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6692307692307692, "acc_stderr": 0.02385479568097112, "acc_norm": 0.6692307692307692, "acc_norm_stderr": 0.02385479568097112 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3592592592592593, "acc_stderr": 0.02925290592725197, "acc_norm": 0.3592592592592593, "acc_norm_stderr": 0.02925290592725197 }, "harness|hendrycksTest-high_school_microeconomics



