open-llm-leaderboard-old/details_NousResearch__Nous-Hermes-2-Mistral-7B-DPO
数据集概述
数据集摘要
该数据集是在模型 NousResearch/Nous-Hermes-2-Mistral-7B-DPO 在 Open LLM Leaderboard 上的评估运行期间自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 2 次运行生成。每次运行在各个配置中对应一个特定的分割(split),分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_NousResearch__Nous-Hermes-2-Mistral-7B-DPO",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-02-21T00:48:36.188963 运行的最新结果:
python { "all": { "acc": 0.6347060781129819, "acc_stderr": 0.03219733375318615, "acc_norm": 0.63654655633088, "acc_norm_stderr": 0.03283944068311936, "mc1": 0.38555691554467564, "mc1_stderr": 0.017038839010591667, "mc2": 0.5578131545614541, "mc2_stderr": 0.015293843998956788 }, "harness|arc:challenge|25": { "acc": 0.6168941979522184, "acc_stderr": 0.014206472661672876, "acc_norm": 0.6569965870307167, "acc_norm_stderr": 0.013872423223718164 }, "harness|hellaswag|10": { "acc": 0.6608245369448317, "acc_stderr": 0.004724619193427586, "acc_norm": 0.8494323839872535, "acc_norm_stderr": 0.003568960247101678 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.28, "acc_stderr": 0.045126085985421276, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5851851851851851, "acc_stderr": 0.04256193767901408, "acc_norm": 0.5851851851851851, "acc_norm_stderr": 0.04256193767901408 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6842105263157895, "acc_stderr": 0.0378272898086547, "acc_norm": 0.6842105263157895, "acc_norm_stderr": 0.0378272898086547 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6867924528301886, "acc_stderr": 0.028544793319055326, "acc_norm": 0.6867924528301886, "acc_norm_stderr": 0.028544793319055326 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7708333333333334, "acc_stderr": 0.03514697467862388, "acc_norm": 0.7708333333333334, "acc_norm_stderr": 0.03514697467862388 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.46, "acc_stderr": 0.05009082659620332, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.46, "acc_stderr": 0.05009082659620333, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-college_mathematics|5": 
{ "acc": 0.24, "acc_stderr": 0.04292346959909282, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6069364161849711, "acc_stderr": 0.0372424959581773, "acc_norm": 0.6069364161849711, "acc_norm_stderr": 0.0372424959581773 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.39215686274509803, "acc_stderr": 0.048580835742663454, "acc_norm": 0.39215686274509803, "acc_norm_stderr": 0.048580835742663454 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.548936170212766, "acc_stderr": 0.032529096196131965, "acc_norm": 0.548936170212766, "acc_norm_stderr": 0.032529096196131965 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4649122807017544, "acc_stderr": 0.046920083813689104, "acc_norm": 0.4649122807017544, "acc_norm_stderr": 0.046920083813689104 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5310344827586206, "acc_stderr": 0.04158632762097828, "acc_norm": 0.5310344827586206, "acc_norm_stderr": 0.04158632762097828 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.42328042328042326, "acc_stderr": 0.02544636563440678, "acc_norm": 0.42328042328042326, "acc_norm_stderr": 0.02544636563440678 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.42857142857142855, "acc_stderr": 0.04426266681379909, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.04426266681379909 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7967741935483871, "acc_stderr": 0.02289168798455495, "acc_norm": 0.7967741935483871, "acc_norm_stderr": 0.02289168798455495 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5123152709359606, "acc_stderr": 
0.035169204442208966, "acc_norm": 0.5123152709359606, "acc_norm_stderr": 0.035169204442208966 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.67, "acc_stderr": 0.047258156262526066, "acc_norm": 0.67, "acc_norm_stderr": 0.047258156262526066 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.793939393939394, "acc_stderr": 0.03158415324047711, "acc_norm": 0.793939393939394, "acc_norm_stderr": 0.03158415324047711 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7828282828282829, "acc_stderr": 0.029376616484945633, "acc_norm": 0.7828282828282829, "acc_norm_stderr": 0.029376616484945633 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8808290155440415, "acc_stderr": 0.023381935348121434, "acc_norm": 0.8808290155440415, "acc_norm_stderr": 0.023381935348121434 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6051282051282051, "acc_stderr": 0.024784316942156406, "acc_norm": 0.6051282051282051, "acc_norm_stderr": 0.024784316942156406 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3111111111111111, "acc_stderr": 0.028226446749683515, "acc_norm": 0.3111111111111111, "acc_norm_stderr": 0.028226446749683515 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6764705882352942, "acc_stderr": 0.0303883535518868, "acc_norm": 0.6764705882352942, "acc_norm_stderr": 0.0303883535518868 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.3509933774834437, "acc_stderr": 0.03896981964257375, "acc_norm": 0.3509933774834437, "acc_norm_stderr": 0.03896981964257375 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8311926605504587, "acc_stderr": 0.016060056268530336, "acc_norm": 0.8311926605504587, "acc_norm_stderr": 0.016060056268530336 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.49074074074074076, "acc_stderr": 0.034093869469927006, "acc_norm": 0.49074074074074076, "acc_norm_stderr": 0.034093869469927006 }, 
"harness|hendrycksTest-high_school_us_history|5": { "acc": 0.7941176470588235, "acc_stderr": 0.028379449451588663, "acc_norm": 0.7941176470588235, "acc_norm_stderr": 0.028379449451588663 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.810126582278481, "acc_stderr": 0.02553010046023349, "acc_norm": 0.810126582278481, "acc_norm_stderr": 0.02553010046023349 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.7085201793721974, "acc_stderr": 0.030500283176545843, "acc_norm": 0.7085201793721974, "acc_norm_stderr": 0.030500283176545843 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7709923664122137, "acc_stderr": 0.036853466317118506, "acc_norm": 0.7709923664122137, "acc_norm_stderr": 0.036853466317118506 }, "harness|hendrycksTest-international_law|5": { "acc": 0.7768595041322314, "acc_stderr": 0.03800754475228732, "acc_norm": 0.7768595041322314, "acc_norm_stderr": 0.03800754475228732 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7777777777777778, "acc_stderr": 0.0401910747255735, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.0401910747255735 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7975460122699386, "acc_stderr": 0.031570650789119005, "acc_norm": 0.7975460122699386, "acc_norm_stderr": 0.031570650789119005 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.49107142857142855, "acc_stderr": 0.04745033255489123, "acc_norm": 0.49107142857142855, "acc_norm_stderr": 0.04745033255489123 }, "harness|hendrycksTest-management|5": { "acc": 0.7864077669902912, "acc_stderr": 0.040580420156460344, "acc_norm": 0.7864077669902912, "acc_norm_stderr": 0.040580420156460344 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8632478632478633, "acc_stderr": 0.02250903393707781, "acc_norm": 0.8632478632478633, "acc_norm_stderr": 0.02250903393707781 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, 
"harness|hendrycksTest-miscellaneous|5": { "acc": 0.8275862068965517, "acc_stderr": 0.013507943909371803, "acc_norm": 0.8275862068965517, "acc_norm_stderr": 0.013507943909371803 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7138728323699421, "acc_stderr": 0.02433214677913413, "acc_norm": 0.7138728323699421, "acc_norm_stderr": 0.02433214677913413 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.2860335195530726, "acc_stderr": 0.015113972129062143, "acc_norm": 0.2860335195530726, "acc_norm_stderr": 0.015113972129062143 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7418300653594772, "acc_stderr": 0.02505850331695814, "acc_norm": 0.7418300653594772, "acc_norm_stderr": 0.02505850331695814 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.6816720257234726, "acc_stderr": 0.026457225067811025, "acc_norm": 0.6816720257234726, "acc_norm_stderr": 0.026457225067811025 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.75, "acc_stderr": 0.02409347123262133, "acc_norm": 0.75, "acc_norm_stderr": 0.02409347123262133 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.5070921985815603, "acc_stderr": 0.02982449855912901, "acc_norm": 0.5070921985815603, "acc_norm_stderr": 0.02982449855912901 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.46740547588005216, "acc_stderr": 0.012743072942653342, "acc_norm": 0.46740547588005216, "acc_norm_stderr": 0.012743072942653342 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6727941176470589, "acc_stderr": 0.02850145286039655, "acc_norm": 0.6727941176470589, "acc_norm_stderr": 0.02850145286039655 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6715686274509803, "acc_stderr": 0.018999707383162666, "acc_norm": 0.6715686274509803, "acc_norm_stderr": 0.018999707383162666 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6727272727272727, "acc_stderr": 0.04494290866252091, "acc_norm": 0.6727272727272727, "acc_norm_stderr": 0.04494290866252091 }, 
"harness|hendrycksTest-security_studies|5": { "acc": 0.7387755102040816, "acc_stderr": 0.028123429335142773, "acc_norm": 0.7387755102040816, "acc_norm_stderr": 0.028123429335142773 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8059701492537313, "acc_stderr": 0.027962677604768914, "acc_norm": 0.8059701492537313, "acc_norm_stderr": 0.027962677604768914 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.88, "acc_stderr": 0.032659863237109066, "acc_norm": 0.88, "acc_norm_stderr": 0.032659863237109066 }, "harness|hendrycksTest-virology|5": { "acc": 0.5602409638554217, "acc_stderr": 0.03864139923699122, "acc_norm": 0.5602409638554217, "acc_norm_stderr": 0.03864139923699122 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8304093567251462, "acc_stderr": 0.02878210810540171, "acc_norm": 0.8304093567251462, "acc_norm_stderr": 0.02878210810540171 }, "harness|truthfulqa:mc|0": { "mc1": 0.38555691554467564, "mc1_stderr": 0.017038839010591667, "mc2": 0.5578131545614541, "mc2_stderr": 0.015293843998956788 }, "harness|winogrande|5": { "acc": 0.7845303867403315, "acc_stderr": 0.011555295286059282 }, "harness|gsm8k|5": { "acc": 0.6050037907505686, "acc_stderr": 0.0134653549699732 } }
配置详情
- config_name: harness_arc_challenge_25
  - 分割: 2024_02_21T00_41_28.762847
    - 路径: `**/details_harness|arc:challenge|25_2024-02-21T00-41-28.762847.parquet`
  - 分割: 2024_02_21T00_48_36.188963
    - 路径: `**/details_harness|arc:challenge|25_2024-02-21T00-48-36.188963.parquet`
  - 分割: latest
    - 路径: `**/details_harness|arc:challenge|25_2024-02-21T00-48-36.188963.parquet`
- config_name: harness_gsm8k_5
  - 分割: 2024_02_21T00_41_28.762847
    - 路径: `**/details_harness|gsm8k|5_2024-02-21T00-41-28.762847.parquet`
  - 分割: 2024_02_21T00_48_36.188963
    - 路径: `**/details_harness|gsm8k|5_2024-02-21T00-48-36.188963.parquet`
  - 分割: latest
    - 路径: `**/details_harness|gsm8k|5_2024-02-21T00-48-36.188963.parquet`
- config_name: harness_hellaswag_10
  - 分割: 2024_02_21T00_41_28.762847
    - 路径: `**/details_harness|hellaswag|10_2024-02-21T00-41-28.762847.parquet`
  - 分割: 2024_02_21T00_48_36.188963
    - 路径: `**/details_harness|hellaswag|10_2024-02-21T00-48-36.188963.parquet`
  - 分割: latest
    - 路径: `**/details_harness|hellaswag|10_2024-02-21T00-48-36.188963.parquet`
- config_name: harness_hendrycksTest_5
  - 分割: 2024_02_21T00_41_28.762847
    - 路径:
      - `**/details_harness|hendrycksTest-abstract_algebra|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-anatomy|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-astronomy|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-business_ethics|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-clinical_knowledge|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-college_biology|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-college_chemistry|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-college_computer_science|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-college_mathematics|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-college_medicine|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-college_physics|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-computer_security|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-conceptual_physics|5_2024-02-21T00-41-28.762847.parquet`
      - `**/details_harness|hendrycksTest-econometrics|5_2024-02-21T00-41-28.762847.parquet`




