open-llm-leaderboard-old/details_freeCS-dot-org__ThetaZero-7B-1
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上评估模型 freeCS-dot-org/ThetaZero-7B-1 的过程中自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。数据集由 1 次运行创建;每次运行的详细结果都可以在各个配置中找到,并以该次运行的时间戳作为分割(split)名称。"train"分割始终指向最新的结果。
数据集结构
数据集包含多个配置,每个配置对应一个评估任务。以下是部分配置的详细信息:
- harness_arc_challenge_25:
  - 分割: 2024_01_21T17_39_05.540734, latest
  - 路径: **/details_harness|arc:challenge|25_2024-01-21T17-39-05.540734.parquet
- harness_gsm8k_5:
  - 分割: 2024_01_21T17_39_05.540734, latest
  - 路径: **/details_harness|gsm8k|5_2024-01-21T17-39-05.540734.parquet
- harness_hellaswag_10:
  - 分割: 2024_01_21T17_39_05.540734, latest
  - 路径: **/details_harness|hellaswag|10_2024-01-21T17-39-05.540734.parquet
- harness_hendrycksTest_5:
  - 分割: 2024_01_21T17_39_05.540734, latest
  - 路径: 包含多个子任务的详细路径
最新结果
以下是最新结果的部分摘要:
python { "all": { "acc": 0.6324574731797497, "acc_stderr": 0.03252185227458061, "acc_norm": 0.634882895011124, "acc_norm_stderr": 0.03317372855494942, "mc1": 0.45777233782129745, "mc1_stderr": 0.01744096571248212, "mc2": 0.6248252798346551, "mc2_stderr": 0.015549488905596676 }, "harness|arc:challenge|25": { "acc": 0.628839590443686, "acc_stderr": 0.014117971901142822, "acc_norm": 0.6749146757679181, "acc_norm_stderr": 0.013688147309729122 }, "harness|hellaswag|10": { "acc": 0.6672973511252739, "acc_stderr": 0.004702181042215892, "acc_norm": 0.8569010157339175, "acc_norm_stderr": 0.0034945810763985386 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.28, "acc_stderr": 0.045126085985421296, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421296 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6, "acc_stderr": 0.04232073695151589, "acc_norm": 0.6, "acc_norm_stderr": 0.04232073695151589 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7302631578947368, "acc_stderr": 0.03611780560284898, "acc_norm": 0.7302631578947368, "acc_norm_stderr": 0.03611780560284898 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.61, "acc_stderr": 0.04902071300001974, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6830188679245283, "acc_stderr": 0.02863723563980089, "acc_norm": 0.6830188679245283, "acc_norm_stderr": 0.02863723563980089 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7291666666666666, "acc_stderr": 0.03716177437566017, "acc_norm": 0.7291666666666666, "acc_norm_stderr": 0.03716177437566017 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.33, "acc_stderr": 
0.04725815626252604, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252604 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.630057803468208, "acc_stderr": 0.0368122963339432, "acc_norm": 0.630057803468208, "acc_norm_stderr": 0.0368122963339432 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.35294117647058826, "acc_stderr": 0.047551296160629475, "acc_norm": 0.35294117647058826, "acc_norm_stderr": 0.047551296160629475 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5531914893617021, "acc_stderr": 0.0325005368436584, "acc_norm": 0.5531914893617021, "acc_norm_stderr": 0.0325005368436584 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4298245614035088, "acc_stderr": 0.04657047260594963, "acc_norm": 0.4298245614035088, "acc_norm_stderr": 0.04657047260594963 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5862068965517241, "acc_stderr": 0.04104269211806232, "acc_norm": 0.5862068965517241, "acc_norm_stderr": 0.04104269211806232 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.42592592592592593, "acc_stderr": 0.025467149045469546, "acc_norm": 0.42592592592592593, "acc_norm_stderr": 0.025467149045469546 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.47619047619047616, "acc_stderr": 0.04467062628403273, "acc_norm": 0.47619047619047616, "acc_norm_stderr": 0.04467062628403273 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7290322580645161, "acc_stderr": 0.025284416114900156, "acc_norm": 0.7290322580645161, "acc_norm_stderr": 0.025284416114900156 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5024630541871922, "acc_stderr": 0.03517945038691063, "acc_norm": 0.5024630541871922, 
"acc_norm_stderr": 0.03517945038691063 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7878787878787878, "acc_stderr": 0.031922715695483016, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.031922715695483016 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7727272727272727, "acc_stderr": 0.02985751567338642, "acc_norm": 0.7727272727272727, "acc_norm_stderr": 0.02985751567338642 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8601036269430051, "acc_stderr": 0.025033870583015178, "acc_norm": 0.8601036269430051, "acc_norm_stderr": 0.025033870583015178 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6538461538461539, "acc_stderr": 0.02



