open-llm-leaderboard-old/details_ppopiolek__tinyllama_merged_test
数据集概述
数据集简介
该数据集是在对模型 ppopiolek/tinyllama_merged_test 进行评估运行期间自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。数据集来源于 1 次运行;每次运行在每个配置中都对应一个独立的分割(split),该分割以运行的时间戳命名。"train" 分割始终指向最新的结果。
数据集结构
- 配置数量: 63
- 运行次数: 1
- 分割命名: 使用运行的时间戳
- "train" 分割: 指向最新结果
加载数据示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_ppopiolek__tinyllama_merged_test",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是来自 2024-04-19T00:22:49.313194 运行的最新结果:
python
{
"all": {
"acc": 0.2638324101167344,
"acc_stderr": 0.030960372509611613,
"acc_norm": 0.26500396200970283,
"acc_norm_stderr": 0.03171500244946668,
"mc1": 0.23990208078335373,
"mc1_stderr": 0.014948812679062137,
"mc2": 0.3871982556682638,
"mc2_stderr": 0.014284052060856063
},
"harness|arc:challenge|25": {
"acc": 0.35238907849829354,
"acc_stderr": 0.013960142600598678,
"acc_norm": 0.3720136518771331,
"acc_norm_stderr": 0.014124597881844461
},
"harness|hellaswag|10": {
"acc": 0.4599681338378809,
"acc_stderr": 0.0049737629483028,
"acc_norm": 0.6132244572794264,
"acc_norm_stderr": 0.00486016207633096
},
"harness|hendrycksTest-abstract_algebra|5": {
"acc": 0.24,
"acc_stderr": 0.04292346959909284,
"acc_norm": 0.24,
"acc_norm_stderr": 0.04292346959909284
},
"harness|hendrycksTest-anatomy|5": {
"acc": 0.2,
"acc_stderr": 0.034554737023254366,
"acc_norm": 0.2,
"acc_norm_stderr": 0.034554737023254366
},
"harness|hendrycksTest-astronomy|5": {
"acc": 0.18421052631578946,
"acc_stderr": 0.0315469804508223,
"acc_norm": 0.18421052631578946,
"acc_norm_stderr": 0.0315469804508223
},
"harness|hendrycksTest-business_ethics|5": {
"acc": 0.23,
"acc_stderr": 0.04229525846816506,
"acc_norm": 0.23,
"acc_norm_stderr": 0.04229525846816506
},
"harness|hendrycksTest-clinical_knowledge|5": {
"acc": 0.2339622641509434,
"acc_stderr": 0.02605529690115292,
"acc_norm": 0.2339622641509434,
"acc_norm_stderr": 0.02605529690115292
},
"harness|hendrycksTest-college_biology|5": {
"acc": 0.2361111111111111,
"acc_stderr": 0.03551446610810826,
"acc_norm": 0.2361111111111111,
"acc_norm_stderr": 0.03551446610810826
},
"harness|hendrycksTest-college_chemistry|5": {
"acc": 0.36,
"acc_stderr": 0.048241815132442176,
"acc_norm": 0.36,
"acc_norm_stderr": 0.048241815132442176
},
"harness|hendrycksTest-college_computer_science|5": {
"acc": 0.27,
"acc_stderr": 0.044619604333847394,
"acc_norm": 0.27,
"acc_norm_stderr": 0.044619604333847394
},
"harness|hendrycksTest-college_mathematics|5": {
"acc": 0.29,
"acc_stderr": 0.045604802157206845,
"acc_norm": 0.29,
"acc_norm_stderr": 0.045604802157206845
},
"harness|hendrycksTest-college_medicine|5": {
"acc": 0.2254335260115607,
"acc_stderr": 0.031862098516411426,
"acc_norm": 0.2254335260115607,
"acc_norm_stderr": 0.031862098516411426
},
"harness|hendrycksTest-college_physics|5": {
"acc": 0.20588235294117646,
"acc_stderr": 0.04023382273617747,
"acc_norm": 0.20588235294117646,
"acc_norm_stderr": 0.04023382273617747
},
"harness|hendrycksTest-computer_security|5": {
"acc": 0.27,
"acc_stderr": 0.044619604333847394,
"acc_norm": 0.27,
"acc_norm_stderr": 0.044619604333847394
},
"harness|hendrycksTest-conceptual_physics|5": {
"acc": 0.251063829787234,
"acc_stderr": 0.028346963777162452,
"acc_norm": 0.251063829787234,
"acc_norm_stderr": 0.028346963777162452
},
"harness|hendrycksTest-econometrics|5": {
"acc": 0.24561403508771928,
"acc_stderr": 0.04049339297748142,
"acc_norm": 0.24561403508771928,
"acc_norm_stderr": 0.04049339297748142
},
"harness|hendrycksTest-electrical_engineering|5": {
"acc": 0.2689655172413793,
"acc_stderr": 0.03695183311650232,
"acc_norm": 0.2689655172413793,
"acc_norm_stderr": 0.03695183311650232
},
"harness|hendrycksTest-elementary_mathematics|5": {
"acc": 0.25132275132275134,
"acc_stderr": 0.022340482339643898,
"acc_norm": 0.25132275132275134,
"acc_norm_stderr": 0.022340482339643898
},
"harness|hendrycksTest-formal_logic|5": {
"acc": 0.20634920634920634,
"acc_stderr": 0.03619604524124252,
"acc_norm": 0.20634920634920634,
"acc_norm_stderr": 0.03619604524124252
},
"harness|hendrycksTest-global_facts|5": {
"acc": 0.22,
"acc_stderr": 0.04163331998932269,
"acc_norm": 0.22,
"acc_norm_stderr": 0.04163331998932269
},
"harness|hendrycksTest-high_school_biology|5": {
"acc": 0.20967741935483872,
"acc_stderr": 0.023157879349083522,
"acc_norm": 0.20967741935483872,
"acc_norm_stderr": 0.023157879349083522
},
"harness|hendrycksTest-high_school_chemistry|5": {
"acc": 0.19704433497536947,
"acc_stderr": 0.027986724666736223,
"acc_norm": 0.19704433497536947,
"acc_norm_stderr": 0.027986724666736223
},
"harness|hendrycksTest-high_school_computer_science|5": {
"acc": 0.32,
"acc_stderr": 0.046882617226215034,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"harness|hendrycksTest-high_school_european_history|5": {
"acc": 0.2787878787878788,
"acc_stderr": 0.03501438706296781,
"acc_norm": 0.2787878787878788,
"acc_norm_stderr": 0.03501438706296781
},
"harness|hendrycksTest-high_school_geography|5": {
"acc": 0.23737373737373738,
"acc_stderr": 0.0303137105381989,
"acc_norm": 0.23737373737373738,
"acc_norm_stderr": 0.0303137105381989
},
"harness|hendrycksTest-high_school_government_and_politics|5": {
"acc": 0.30569948186528495,
"acc_stderr": 0.03324837939758159,
"acc_norm": 0.30569948186528495,
"acc_norm_stderr": 0.03324837939758159
},
"harness|hendrycksTest-high_school_macroeconomics|5": {
"acc": 0.26666666666666666,
"acc_stderr": 0.02242127361292371,
"acc_norm": 0.26666666666666666,
"acc_norm_stderr": 0.02242127361292371
},
"harness|hendrycksTest-high_school_mathematics|5": {
"acc": 0.24444444444444444,
"acc_stderr": 0.026202766534652148,
"acc_norm": 0.24444444444444444,
"acc_norm_stderr": 0.026202766534652148
},
"harness|hendrycksTest-high_school_microeconomics|5": {
"acc": 0.23109243697478993,
"acc_stderr": 0.02738140692786896,
"acc_norm": 0.23109243697478993,
"acc_norm_stderr": 0.02738140692786896
},
"



