open-llm-leaderboard-old/details_cloudyu__TomGrc_FusionNet_34Bx2_MoE_v0.1_full_linear_DPO
数据集概述
该数据集是在对模型 cloudyu/TomGrc_FusionNet_34Bx2_MoE_v0.1_full_linear_DPO 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集由63个配置组成,每个配置对应一个评估任务。
- 数据集由1次运行创建。每次运行在各个配置中对应一个特定的拆分,拆分以该次运行的时间戳命名。
- "train" 拆分始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_cloudyu__TomGrc_FusionNet_34Bx2_MoE_v0.1_full_linear_DPO", "harness_winogrande_5", split="train")
```
最新结果
以下是 最新结果 的摘要: python { "all": { "acc": 0.7649892778549832, "acc_stderr": 0.02823313368050758, "acc_norm": 0.7681511495490131, "acc_norm_stderr": 0.028777527908042073, "mc1": 0.5458996328029376, "mc1_stderr": 0.017429593091323522, "mc2": 0.7131962651033679, "mc2_stderr": 0.014139525056193024 }, "harness|arc:challenge|25": { "acc": 0.7167235494880546, "acc_stderr": 0.013167478735134575, "acc_norm": 0.7406143344709898, "acc_norm_stderr": 0.012808273573927097 }, "harness|hellaswag|10": { "acc": 0.6703843855805617, "acc_stderr": 0.004691128722535485, "acc_norm": 0.8666600278828919, "acc_norm_stderr": 0.003392470498816845 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.7555555555555555, "acc_stderr": 0.03712537833614866, "acc_norm": 0.7555555555555555, "acc_norm_stderr": 0.03712537833614866 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.875, "acc_stderr": 0.026913523521537846, "acc_norm": 0.875, "acc_norm_stderr": 0.026913523521537846 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.77, "acc_stderr": 0.04229525846816505, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816505 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.8037735849056604, "acc_stderr": 0.024442388131100813, "acc_norm": 0.8037735849056604, "acc_norm_stderr": 0.024442388131100813 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.9027777777777778, "acc_stderr": 0.024774516250440182, "acc_norm": 0.9027777777777778, "acc_norm_stderr": 0.024774516250440182 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.53, "acc_stderr": 0.05016135580465919, "acc_norm": 0.53, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.61, "acc_stderr": 0.049020713000019756, "acc_norm": 0.61, "acc_norm_stderr": 0.049020713000019756 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.7109826589595376, "acc_stderr": 0.034564257450869995, "acc_norm": 0.7109826589595376, "acc_norm_stderr": 0.034564257450869995 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.5196078431372549, "acc_stderr": 0.04971358884367406, "acc_norm": 0.5196078431372549, "acc_norm_stderr": 0.04971358884367406 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.79, "acc_stderr": 0.04093601807403326, "acc_norm": 0.79, "acc_norm_stderr": 0.04093601807403326 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.7574468085106383, "acc_stderr": 0.028020226271200217, "acc_norm": 0.7574468085106383, "acc_norm_stderr": 0.028020226271200217 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5964912280701754, "acc_stderr": 0.04615186962583707, "acc_norm": 0.5964912280701754, "acc_norm_stderr": 0.04615186962583707 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.7517241379310344, "acc_stderr": 0.036001056927277696, "acc_norm": 0.7517241379310344, "acc_norm_stderr": 0.036001056927277696 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.7486772486772487, "acc_stderr": 0.0223404823396439, "acc_norm": 0.7486772486772487, "acc_norm_stderr": 0.0223404823396439 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5158730158730159, "acc_stderr": 0.044698818540726076, "acc_norm": 0.5158730158730159, "acc_norm_stderr": 0.044698818540726076 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.61, "acc_stderr": 0.04902071300001975, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.9064516129032258, "acc_stderr": 0.016565754668270982, "acc_norm": 0.9064516129032258, "acc_norm_stderr": 0.016565754668270982 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.6699507389162561, "acc_stderr": 
0.033085304262282574, "acc_norm": 0.6699507389162561, "acc_norm_stderr": 0.033085304262282574 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.77, "acc_stderr": 0.042295258468165044, "acc_norm": 0.77, "acc_norm_stderr": 0.042295258468165044 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8848484848484849, "acc_stderr": 0.024925699798115344, "acc_norm": 0.8848484848484849, "acc_norm_stderr": 0.024925699798115344 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.9343434343434344, "acc_stderr": 0.017646526677233335, "acc_norm": 0.9343434343434344, "acc_norm_stderr": 0.017646526677233335 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9740932642487047, "acc_stderr": 0.011464523356953162, "acc_norm": 0.9740932642487047, "acc_norm_stderr": 0.011464523356953162 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.8102564102564103, "acc_stderr": 0.019880165406588796, "acc_norm": 0.8102564102564103, "acc_norm_stderr": 0.019880165406588796 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.45925925925925926, "acc_stderr": 0.030384169232350832, "acc_norm": 0.459



