open-llm-leaderboard-old/details_FINGU-AI__FinguAI-Chat-v1
数据集概述
数据集简介
该数据集是在对模型 FINGU-AI/FinguAI-Chat-v1 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 数据来源:数据集由 1 次运行创建;每次运行在每个配置中对应一个独立的分割(split),分割以该次运行的时间戳命名。
- 最新结果:"train" 分割始终指向最新的结果。
- 结果汇总:一个额外的配置 "results" 存储所有运行的汇总结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_FINGU-AI__FinguAI-Chat-v1", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-03-30T16:01:46.276277 运行的最新结果:
python { "all": { "acc": 0.3035096239262173, "acc_stderr": 0.03240600643656583, "acc_norm": 0.3060120270192622, "acc_norm_stderr": 0.033223801457801135, "mc1": 0.2521419828641371, "mc1_stderr": 0.01520152224629997, "mc2": 0.4279230644927746, "mc2_stderr": 0.014980700973553645 }, "harness|arc:challenge|25": { "acc": 0.25426621160409557, "acc_stderr": 0.01272499994515774, "acc_norm": 0.29180887372013653, "acc_norm_stderr": 0.013284525292403503 }, "harness|hellaswag|10": { "acc": 0.3567018522206732, "acc_stderr": 0.004780467270911761, "acc_norm": 0.44084843656642103, "acc_norm_stderr": 0.004954740808837202 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.24444444444444444, "acc_stderr": 0.037125378336148665, "acc_norm": 0.24444444444444444, "acc_norm_stderr": 0.037125378336148665 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.3355263157894737, "acc_stderr": 0.038424985593952694, "acc_norm": 0.3355263157894737, "acc_norm_stderr": 0.038424985593952694 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.41, "acc_stderr": 0.04943110704237102, "acc_norm": 0.41, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.3320754716981132, "acc_stderr": 0.02898545565233439, "acc_norm": 0.3320754716981132, "acc_norm_stderr": 0.02898545565233439 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.3055555555555556, "acc_stderr": 0.03852084696008534, "acc_norm": 0.3055555555555556, "acc_norm_stderr": 0.03852084696008534 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.35260115606936415, "acc_stderr": 0.036430371689585475, "acc_norm": 0.35260115606936415, "acc_norm_stderr": 0.036430371689585475 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.28431372549019607, "acc_stderr": 0.04488482852329017, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.04488482852329017 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.24, "acc_stderr": 0.04292346959909284, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909284 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.225531914893617, "acc_stderr": 0.027321078417387536, "acc_norm": 0.225531914893617, "acc_norm_stderr": 0.027321078417387536 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.23684210526315788, "acc_stderr": 0.039994238792813344, "acc_norm": 0.23684210526315788, "acc_norm_stderr": 0.039994238792813344 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.2689655172413793, "acc_stderr": 0.036951833116502325, "acc_norm": 0.2689655172413793, "acc_norm_stderr": 0.036951833116502325 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.2619047619047619, "acc_stderr": 0.022644212615525214, "acc_norm": 0.2619047619047619, "acc_norm_stderr": 0.022644212615525214 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3492063492063492, "acc_stderr": 0.04263906892795132, "acc_norm": 0.3492063492063492, "acc_norm_stderr": 0.04263906892795132 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.19, "acc_stderr": 0.039427724440366234, "acc_norm": 0.19, "acc_norm_stderr": 0.039427724440366234 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.3161290322580645, "acc_stderr": 0.026450874489042764, "acc_norm": 0.3161290322580645, "acc_norm_stderr": 0.026450874489042764 }, "harness|hendrycksTest-high_school_chemistry|5": { 
"acc": 0.30049261083743845, "acc_stderr": 0.03225799476233485, "acc_norm": 0.30049261083743845, "acc_norm_stderr": 0.03225799476233485 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.34545454545454546, "acc_stderr": 0.03713158067481912, "acc_norm": 0.34545454545454546, "acc_norm_stderr": 0.03713158067481912 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.3939393939393939, "acc_stderr": 0.03481285338232963, "acc_norm": 0.3939393939393939, "acc_norm_stderr": 0.03481285338232963 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.37823834196891193, "acc_stderr": 0.03499807276193339, "acc_norm": 0.37823834196891193, "acc_norm_stderr": 0.03499807276193339 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.35384615384615387, "acc_stderr": 0.024243783994062167, "acc_norm": 0.35384615384615387, "acc_norm_stderr": 0.024243783994062167 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.26296296296296295, "acc_stderr": 0.026842057873833706, "acc_norm": 0.26296296296296295, "acc_norm_stderr": 0.026842057873833706 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.36134453781512604, "acc_stderr": 0.031204691225150006, "acc_norm": 0.36134453781512604, "acc_norm_stderr": 0.031204691225150006 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.33112582781456956, "acc_stderr": 0.038425817186598696, "acc_norm": 0.33112582781456956, "acc_norm_stderr": 0.038425817186598696 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.363302752293578, "acc_stderr": 0.020620603919625807, "acc_norm": 0.363302752293578, "acc_norm_stderr": 0.020620603919625807 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.4722222222222222, "acc_stderr": 0.0340470532865388, "acc_norm": 0.4722222222222222, 
"acc_norm_stderr": 0.0340470532865388 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.3235294117647059, "acc_stderr": 0.03283472056108567, "acc_norm": 0.3235294117647059, "acc_norm_stderr": 0.03283472056108567 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.33755274261603374, "acc_stderr": 0.03078154910202621, "acc_norm": 0.33755274261603374, "acc_norm_stderr": 0.03078154910202621 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.15246636771300448, "acc_stderr": 0.024126204813252877, "acc_norm": 0.15246636771300448, "acc_norm_stderr": 0.024126204813252877 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.3053435114503817, "acc_stderr": 0.04039314978724561, "acc_norm": 0.3053435114503817, "acc_norm_stderr": 0.04039314978724561 }, "harness|hendrycksTest-international_law|5": { "acc": 0.2727272727272727, "acc_stderr": 0.04065578140908705, "acc_norm": 0.2727272727272727, "acc_norm_stderr": 0.04065578140908705 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.32407407407407407, "acc_stderr": 0.045245960070300496, "acc_norm": 0.32407407407407407, "acc_norm_stderr": 0.045245960070300496 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.25766871165644173, "acc_stderr": 0.03436150827846917, "acc_norm": 0.25766871165644173, "acc_norm_stderr": 0.03436150827846917 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.22321428571428573, "acc_stderr": 0.039523019677025116, "acc_norm": 0.22321428571428573, "acc_norm_stderr": 0.039523019677025116 }, "harness|hendrycksTest-management|5": { "acc": 0.3786407766990291, "acc_stderr": 0.04802694698258972, "acc_norm": 0.3786407766990291, "acc_norm_stderr": 0.04802694698258972 }, "harness|hendrycksTest-marketing|5": { "acc": 0.3803418803418803, "acc_stderr": 0.03180425204384099, "acc_norm": 0.3803418803418803, "acc_norm_stderr": 0.03180425204384099 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, 
"acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.2554278416347382, "acc_stderr": 0.015594955384455773, "acc_norm": 0.2554278416347382, "acc_norm_stderr": 0.015594955384455773 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.2947976878612717, "acc_stderr": 0.024547617794803835, "acc_norm": 0.2947976878612717, "acc_norm_stderr": 0.024547617794803835 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.27262569832402234, "acc_stderr": 0.014893391735249588, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.3202614379084967, "acc_stderr": 0.026716118380156847, "acc_norm": 0.3202614379084967, "acc_norm_stderr": 0.026716118380156847 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.24437299035369775, "acc_stderr": 0.024406162094668886, "acc_norm": 0.24437299035369775, "acc_norm_stderr": 0.024406162094668886 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.27469135802469136, "acc_stderr": 0.024836057868294677, "acc_norm": 0.27469135802469136, "acc_norm_stderr": 0.024836057868294677 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.25886524822695034, "acc_stderr": 0.026129572527180848, "acc_norm": 0.25886524822695034, "acc_norm_stderr": 0.026129572527180848 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.2770534550195567, "acc_stderr": 0.011430462443719683, "acc_norm": 0.2770534550195567, "acc_norm_stderr": 0.011430462443719683 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.4375, "acc_stderr": 0.030134614954403924, "acc_norm": 0.4375, "acc_norm_stderr": 0.030134614954403924 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.2581699346405229, "acc_stderr": 0.017704531653250075, "acc_norm": 0.2581699346405229, "acc_norm_stderr": 0.017704531653250075 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.24545454545454545, "acc_stderr": 0.041220665028782834, "acc_norm": 
0.24545454545454545, "acc_norm_stderr": 0.041220665028782834 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.2653061224489796, "acc_stderr": 0.028263889943784596, "acc_norm": 0.2653061224489796, "acc_norm_stderr": 0.028263889943784596 }, "harness|hendrycksTest-sociology|5": { "acc": 0.34328358208955223, "acc_stderr": 0.03357379665433431, "acc_norm": 0.34328358208955223, "acc_norm_stderr": 0.03357379665433431 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-virology|5": { "acc": 0.22289156626506024, "acc_stderr": 0.03240004825594689, "acc_norm": 0.22289156626506024, "acc_norm_stderr": 0.03240004825594689 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.23976608187134502, "acc_stderr": 0.03274485211946956, "acc_norm": 0.23976608187134502, "acc_norm_stderr": 0.03274485211946956 }, "harness|truthfulqa:mc|0": { "mc1": 0.2521419828641371, "mc1_stderr": 0.01520152224629997, "mc2": 0.4279230644927746, "mc2_stderr": 0.014980700973553645 }, "harness|winogrande|5": { "acc": 0.5659037095501184, "acc_stderr": 0.013929882555694044 }, "harness|gsm8k|5": { "acc": 0.015163002274450341, "acc_stderr": 0.0033660229497263455 } }
配置详情
- harness_arc_challenge_25
  - 分割:2024_03_30T16_01_46.276277
    - 路径:`**/details_harness|arc:challenge|25_2024-03-30T16-01-46.276277.parquet`
  - 分割:latest
    - 路径:`**/details_harness|arc:challenge|25_2024-03-30T16-01-46.276277.parquet`
- harness_gsm8k_5
  - 分割:2024_03_30T16_01_46.276277
    - 路径:`**/details_harness|gsm8k|5_2024-03-30T16-01-46.276277.parquet`
  - 分割:latest
    - 路径:`**/details_harness|gsm8k|5_2024-03-30T16-01-46.276277.parquet`
- harness_hellaswag_10
  - 分割:2024_03_30T16_01_46.276277
    - 路径:`**/details_harness|hellaswag|10_2024-03-30T16-01-46.276277.parquet`
  - 分割:latest
    - 路径:`**/details_harness|hellaswag|10_2024-03-30T16-01-46.276277.parquet`
- harness_hendrycksTest_5
  - 分割:2024_03_30T16_01_46.276277
    - 路径:
      - `**/details_harness|hendrycksTest-abstract_algebra|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-anatomy|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-astronomy|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-business_ethics|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-clinical_knowledge|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-college_biology|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-college_chemistry|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-college_computer_science|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-college_mathematics|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-college_medicine|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-college_physics|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-computer_security|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-conceptual_physics|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-econometrics|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-electrical_engineering|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-elementary_mathematics|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-formal_logic|5_2024-03-30T16-01-46.276277.parquet`
      - `**/details_harness|hendrycksTest-global_facts|5_2024-03-30T16-01-46.276277.parquet`



