open-llm-leaderboard-old/details_jondurbin__bagel-dpo-34b-v0.2
数据集概述
该数据集是在对模型 jondurbin/bagel-dpo-34b-v0.2 进行评估运行期间自动创建的。数据集包含63个配置,每个配置对应一个评估任务。数据集由2次运行创建;在每个配置中,每次运行对应一个独立的分割,分割以该次运行的时间戳命名。"train" 分割始终指向最新的结果。
数据集结构
数据集包含以下配置:
- harness_arc_challenge_25
  - 分割:2024_01_05T04_10_08.473090
  - 分割:2024_01_05T04_16_58.738953
  - 分割:latest
- harness_gsm8k_5
  - 分割:2024_01_05T04_10_08.473090
  - 分割:2024_01_05T04_16_58.738953
  - 分割:latest
- harness_hellaswag_10
  - 分割:2024_01_05T04_10_08.473090
  - 分割:2024_01_05T04_16_58.738953
  - 分割:latest
- harness_hendrycksTest_5
  - 分割:2024_01_05T04_10_08.473090
  - 分割:2024_01_05T04_16_58.738953
  - 分割:latest
最新结果
以下是2024年1月5日04:16:58.738953运行的最新结果:
python { "all": { "acc": 0.7613608627936146, "acc_stderr": 0.028274274385660204, "acc_norm": 0.7665014924179901, "acc_norm_stderr": 0.028800772478207726, "mc1": 0.5336597307221542, "mc1_stderr": 0.017463793867168106, "mc2": 0.7005121569261619, "mc2_stderr": 0.014305944779045657 }, "harness|arc:challenge|25": { "acc": 0.6902730375426621, "acc_stderr": 0.013512058415238363, "acc_norm": 0.7192832764505119, "acc_norm_stderr": 0.013131238126975578 }, "harness|hellaswag|10": { "acc": 0.6579366660027883, "acc_stderr": 0.004734311435009195, "acc_norm": 0.8525194184425413, "acc_norm_stderr": 0.0035385967737048152 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.46, "acc_stderr": 0.05009082659620333, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.7407407407407407, "acc_stderr": 0.03785714465066653, "acc_norm": 0.7407407407407407, "acc_norm_stderr": 0.03785714465066653 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.875, "acc_stderr": 0.026913523521537846, "acc_norm": 0.875, "acc_norm_stderr": 0.026913523521537846 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.78, "acc_stderr": 0.04163331998932262, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932262 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.8075471698113208, "acc_stderr": 0.024262979839372274, "acc_norm": 0.8075471698113208, "acc_norm_stderr": 0.024262979839372274 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.9027777777777778, "acc_stderr": 0.024774516250440182, "acc_norm": 0.9027777777777778, "acc_norm_stderr": 0.024774516250440182 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.63, "acc_stderr": 0.048523658709391, "acc_norm": 0.63, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.39, 
"acc_stderr": 0.04902071300001974, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.7514450867052023, "acc_stderr": 0.03295304696818317, "acc_norm": 0.7514450867052023, "acc_norm_stderr": 0.03295304696818317 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.5588235294117647, "acc_stderr": 0.049406356306056595, "acc_norm": 0.5588235294117647, "acc_norm_stderr": 0.049406356306056595 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.8, "acc_stderr": 0.04020151261036845, "acc_norm": 0.8, "acc_norm_stderr": 0.04020151261036845 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.774468085106383, "acc_stderr": 0.027321078417387536, "acc_norm": 0.774468085106383, "acc_norm_stderr": 0.027321078417387536 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5877192982456141, "acc_stderr": 0.04630653203366596, "acc_norm": 0.5877192982456141, "acc_norm_stderr": 0.04630653203366596 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.7310344827586207, "acc_stderr": 0.036951833116502325, "acc_norm": 0.7310344827586207, "acc_norm_stderr": 0.036951833116502325 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.7142857142857143, "acc_stderr": 0.02326651221373057, "acc_norm": 0.7142857142857143, "acc_norm_stderr": 0.02326651221373057 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.6190476190476191, "acc_stderr": 0.04343525428949097, "acc_norm": 0.6190476190476191, "acc_norm_stderr": 0.04343525428949097 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.59, "acc_stderr": 0.04943110704237102, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.9032258064516129, "acc_stderr": 0.016818943416345197, "acc_norm": 0.9032258064516129, "acc_norm_stderr": 0.016818943416345197 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.6354679802955665, "acc_stderr": 0.0338640574606209, "acc_norm": 
0.6354679802955665, "acc_norm_stderr": 0.0338640574606209 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.8, "acc_stderr": 0.04020151261036846, "acc_norm": 0.8, "acc_norm_stderr": 0.04020151261036846 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8666666666666667, "acc_stderr": 0.026544435312706456, "acc_norm": 0.8666666666666667, "acc_norm_stderr": 0.026544435312706456 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.9242424242424242, "acc_stderr": 0.018852670234993093, "acc_norm": 0.9242424242424242, "acc_norm_stderr": 0.018852670234993093 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9740932642487047, "acc_stderr": 0.011464523356953162, "acc_norm": 0.9740932642487047, "acc_norm_stderr": 0.011464523356953162 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.8179487179487179, "acc_stderr": 0.0195652367829309, "acc_norm": 0.8179487179487179, "acc_norm_stderr": 0.0195652367829309 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.4666666666666667, "acc_stderr": 0.0304177169



