open-llm-leaderboard-old/details_abhinand__tamil-llama-7b-instruct-v0.1
数据集概述
数据集简介
该数据集是在评估模型abhinand/tamil-llama-7b-instruct-v0.1在Open LLM Leaderboard上的运行过程中自动创建的。
数据集组成
- 数据集包含63个配置,每个配置对应一个评估任务。
- 数据集由1次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train"分割始终指向最新的结果。
- 额外的配置"results"存储所有运行结果的聚合,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_abhinand__tamil-llama-7b-instruct-v0.1",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是2023-12-16T14:51:51.361679运行的最新结果:
python { "all": { "acc": 0.4008335001051513, "acc_stderr": 0.034109238722098915, "acc_norm": 0.406154491278963, "acc_norm_stderr": 0.03498024209123229, "mc1": 0.2802937576499388, "mc1_stderr": 0.015723139524608767, "mc2": 0.41698962752686786, "mc2_stderr": 0.014679687695881056 }, "harness|arc:challenge|25": { "acc": 0.43600682593856654, "acc_stderr": 0.014491225699230916, "acc_norm": 0.4803754266211604, "acc_norm_stderr": 0.014600132075947087 }, "harness|hellaswag|10": { "acc": 0.5172276438956384, "acc_stderr": 0.004986818680313444, "acc_norm": 0.7097191794463255, "acc_norm_stderr": 0.004529642828546402 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4074074074074074, "acc_stderr": 0.04244633238353228, "acc_norm": 0.4074074074074074, "acc_norm_stderr": 0.04244633238353228 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.40131578947368424, "acc_stderr": 0.03988903703336283, "acc_norm": 0.40131578947368424, "acc_norm_stderr": 0.03988903703336283 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.46037735849056605, "acc_stderr": 0.030676096599389184, "acc_norm": 0.46037735849056605, "acc_norm_stderr": 0.030676096599389184 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.3402777777777778, "acc_stderr": 0.03962135573486219, "acc_norm": 0.3402777777777778, "acc_norm_stderr": 0.03962135573486219 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.22, "acc_stderr": 0.041633319989322695, "acc_norm": 0.22, "acc_norm_stderr": 0.041633319989322695 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.41, "acc_stderr": 0.04943110704237102, "acc_norm": 0.41, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.3699421965317919, "acc_stderr": 0.0368122963339432, "acc_norm": 0.3699421965317919, "acc_norm_stderr": 0.0368122963339432 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.21568627450980393, "acc_stderr": 0.04092563958237656, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237656 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.3574468085106383, "acc_stderr": 0.03132941789476425, "acc_norm": 0.3574468085106383, "acc_norm_stderr": 0.03132941789476425 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2807017543859649, "acc_stderr": 0.042270544512322004, "acc_norm": 0.2807017543859649, "acc_norm_stderr": 0.042270544512322004 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.41379310344827586, "acc_stderr": 0.04104269211806232, "acc_norm": 0.41379310344827586, "acc_norm_stderr": 0.04104269211806232 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.30158730158730157, "acc_stderr": 0.023636975996101806, "acc_norm": 0.30158730158730157, "acc_norm_stderr": 0.023636975996101806 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.35714285714285715, "acc_stderr": 0.042857142857142816, "acc_norm": 0.35714285714285715, "acc_norm_stderr": 0.042857142857142816 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.45806451612903226, "acc_stderr": 0.028343787250540625, "acc_norm": 0.45806451612903226, "acc_norm_stderr": 0.028343787250540625 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.31527093596059114, "acc_stderr": 0.03269080871970187, "acc_norm": 0.31527093596059114, "acc_norm_stderr": 0.03269080871970187 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.32, "acc_stderr": 0.04688261722621504, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.37575757575757573, "acc_stderr": 0.03781887353205983, "acc_norm": 0.37575757575757573, "acc_norm_stderr": 0.03781887353205983 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.4797979797979798, "acc_stderr": 0.03559443565563919, "acc_norm": 0.4797979797979798, "acc_norm_stderr": 0.03559443565563919 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.5803108808290155, "acc_stderr": 0.035615873276858834, "acc_norm": 0.5803108808290155, "acc_norm_stderr": 0.035615873276858834 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.3923076923076923, "acc_stderr": 0.02475600038213095, "acc_norm": 0.3923076923076923, "acc_norm_stderr": 0.02475600038213095 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.24444444444444444, "acc_stderr": 0.02620276653465215, "acc_norm": 0.24444444444444444, "acc_norm_stderr": 0.02620276653465215 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.38235294117647056, "acc_stderr": 0.03156663099215416, "acc_norm": 0.38235294117647056, "acc_norm_stderr": 0.03156663099215416 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.23178807947019867, "acc_stderr": 0.03445406271987054, "acc_norm": 0.23178807947019867, "acc_norm_stderr": 0.03445406271987054 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.5009174311926605, "acc_stderr": 0.021437287056051215, "acc_norm": 0.5009174311926605, "acc_norm_stderr": 0.021437287056051215 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.4305555555555556, "acc_stderr": 0.03376922151252336, "acc_norm": 0.4305555555555556, 
"acc_norm_stderr": 0.03376922151252336 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.4362745098039216, "acc_stderr": 0.03480693138457038, "acc_norm": 0.4362745098039216, "acc_norm_stderr": 0.03480693138457038 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.5274261603375527, "acc_stderr": 0.032498227183013026, "acc_norm": 0.5274261603375527, "acc_norm_stderr": 0.032498227183013026 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.3721973094170404, "acc_stderr": 0.03244305283008731, "acc_norm": 0.3721973094170404, "acc_norm_stderr": 0.03244305283008731 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.44274809160305345, "acc_stderr": 0.04356447202665069, "acc_norm": 0.44274809160305345, "acc_norm_stderr": 0.04356447202665069 }, "harness|hendrycksTest-international_law|5": { "acc": 0.628099173553719, "acc_stderr": 0.04412015806624504, "acc_norm": 0.628099173553719, "acc_norm_stderr": 0.04412015806624504 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.3611111111111111, "acc_stderr": 0.04643454608906274, "acc_norm": 0.3611111111111111, "acc_norm_stderr": 0.04643454608906274 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.34355828220858897, "acc_stderr": 0.03731133519673893, "acc_norm": 0.34355828220858897, "acc_norm_stderr": 0.03731133519673893 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.25, "acc_stderr": 0.04109974682633932, "acc_norm": 0.25, "acc_norm_stderr": 0.04109974682633932 }, "harness|hendrycksTest-management|5": { "acc": 0.5339805825242718, "acc_stderr": 0.0493929144727348, "acc_norm": 0.5339805825242718, "acc_norm_stderr": 0.0493929144727348 }, "harness|hendrycksTest-marketing|5": { "acc": 0.5470085470085471, "acc_stderr": 0.03261099873098619, "acc_norm": 0.5470085470085471, "acc_norm_stderr": 0.03261099873098619 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, 
"harness|hendrycksTest-miscellaneous|5": { "acc": 0.5146871008939975, "acc_stderr": 0.017872248024429122, "acc_norm": 0.5146871008939975, "acc_norm_stderr": 0.017872248024429122 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.4046242774566474, "acc_stderr": 0.026424816594009852, "acc_norm": 0.4046242774566474, "acc_norm_stderr": 0.026424816594009852 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.23798882681564246, "acc_stderr": 0.014242630070574915, "acc_norm": 0.23798882681564246, "acc_norm_stderr": 0.014242630070574915 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.4542483660130719, "acc_stderr": 0.02850980780262656, "acc_norm": 0.4542483660130719, "acc_norm_stderr": 0.02850980780262656 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.45016077170418006, "acc_stderr": 0.028256660723360187, "acc_norm": 0.45016077170418006, "acc_norm_stderr": 0.028256660723360187 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.4691358024691358, "acc_stderr": 0.027767689606833935, "acc_norm": 0.4691358024691358, "acc_norm_stderr": 0.027767689606833935 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.2765957446808511, "acc_stderr": 0.026684564340460994, "acc_norm": 0.2765957446808511, "acc_norm_stderr": 0.026684564340460994 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.3246414602346806, "acc_stderr": 0.01195908938853002, "acc_norm": 0.3246414602346806, "acc_norm_stderr": 0.01195908938853002 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.4485294117647059, "acc_stderr": 0.030211479609121593, "acc_norm": 0.4485294117647059, "acc_norm_stderr": 0.030211479609121593 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.369281045751634, "acc_stderr": 0.019524316744866342, "acc_norm": 0.369281045751634, "acc_norm_stderr": 0.019524316744866342 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.37272727272727274, "acc_stderr": 0.04631381319425463, "acc_norm": 0.37272727272727274, "acc_norm_stderr": 
0.04631381319425463 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.4163265306122449, "acc_stderr": 0.03155782816556165, "acc_norm": 0.4163265306122449, "acc_norm_stderr": 0.03155782816556165 }, "harness|hendrycksTest-sociology|5": { "acc": 0.4626865671641791, "acc_stderr": 0.03525675167467975, "acc_norm": 0.4626865671641791, "acc_norm_stderr": 0.03525675167467975 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.61, "acc_stderr": 0.04902071300001975, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-virology|5": { "acc": 0.3493975903614458, "acc_stderr": 0.03711725190740751, "acc_norm": 0.3493975903614458, "acc_norm_stderr": 0.03711725190740751 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.6023391812865497, "acc_stderr": 0.0375363895576169, "acc_norm": 0.6023391812865497, "acc_norm_stderr": 0.0375363895576169 }, "harness|truthfulqa:mc|0": { "mc1": 0.2802937576499388, "mc1_stderr": 0.015723139524608767, "mc2": 0.41698962752686786, "mc2_stderr": 0.014679687695881056 }, "harness|winogrande|5": { "acc": 0.7063930544593529, "acc_stderr": 0.012799397296204173 }, "harness|gsm8k|5": { "acc": 0.01819560272934041, "acc_stderr": 0.0036816118940738727 } }
配置文件
- config_name: harness_arc_challenge_25
  - split: 2023_12_16T14_51_51.361679
    path: **/details_harness|arc:challenge|25_2023-12-16T14-51-51.361679.parquet
  - split: latest
    path: **/details_harness|arc:challenge|25_2023-12-16T14-51-51.361679.parquet
- config_name: harness_gsm8k_5
  - split: 2023_12_16T14_51_51.361679
    path: **/details_harness|gsm8k|5_2023-12-16T14-51-51.361679.parquet
  - split: latest
    path: **/details_harness|gsm8k|5_2023-12-16T14-51-51.361679.parquet
- config_name: harness_hellaswag_10
  - split: 2023_12_16T14_51_51.361679
    path: **/details_harness|hellaswag|10_2023-12-16T14-51-51.361679.parquet
  - split: latest
    path: **/details_harness|hellaswag|10_2023-12-16T14-51-51.361679.parquet
-
config_name: harness_hendrycksTest_5split: 2023_12_16T14_51_51.361679path: **/details_harness|hendrycksTest-abstract_algebra|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-anatomy|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-astronomy|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-business_ethics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-clinical_knowledge|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-college_biology|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-college_chemistry|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-college_computer_science|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-college_mathematics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-college_medicine|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-college_physics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-computer_security|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-conceptual_physics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-econometrics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-electrical_engineering|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-elementary_mathematics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-formal_logic|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-global_facts|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_biology|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_chemistry|5_2023-12-16T14-51-51.361679.parquetpath: 
**/details_harness|hendrycksTest-high_school_computer_science|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_european_history|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_geography|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_government_and_politics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_macroeconomics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_mathematics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_microeconomics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_physics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_psychology|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_statistics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_us_history|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-high_school_world_history|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-human_aging|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-human_sexuality|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-international_law|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-jurisprudence|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-logical_fallacies|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-machine_learning|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-management|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-marketing|5_2023-12-16T14-51-51.361679.parquetpath: 
**/details_harness|hendrycksTest-medical_genetics|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-miscellaneous|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-moral_disputes|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-moral_scenarios|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-nutrition|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-philosophy|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-prehistory|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-professional_accounting|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-professional_law|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-professional_medicine|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-professional_psychology|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-public_relations|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-security_studies|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-sociology|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-us_foreign_policy|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-virology|5_2023-12-16T14-51-51.361679.parquetpath: **/details_harness|hendrycksTest-world_religions|5_2023-12-16T14-51-51.361679.parquet




