open-llm-leaderboard-old/details_genaicore3434__Mistral-7b-instruct-v0.2-summ-sft-bf16-e1
收藏数据集概述
数据集摘要
该数据集是在对模型 genaicore3434/Mistral-7b-instruct-v0.2-summ-sft-bf16-e1 进行评估运行时自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。数据集从 1 次运行中创建,每次运行的详细信息可以在每个配置中找到,使用运行的时间戳作为分割名称。"train" 分割始终指向最新的结果。
数据集加载
要加载数据集的详细信息,可以使用以下代码: python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_genaicore3434__Mistral-7b-instruct-v0.2-summ-sft-bf16-e1", "harness_winogrande_5", split="train")
最新结果
以下是来自最新运行 2024-01-21T09:04:23.323517 的结果:
python
{
"all": {
"acc": 0.6066822120870259,
"acc_stderr": 0.03318943094704429,
"acc_norm": 0.6116640035982913,
"acc_norm_stderr": 0.03386329646645478,
"mc1": 0.4847001223990208,
"mc1_stderr": 0.017495304473187902,
"mc2": 0.6471570787741905,
"mc2_stderr": 0.015262709036509099
},
"harness|arc:challenge|25": {
"acc": 0.5622866894197952,
"acc_stderr": 0.01449757388110828,
"acc_norm": 0.60580204778157,
"acc_norm_stderr": 0.01428052266746732
},
"harness|hellaswag|10": {
"acc": 0.637024497112129,
"acc_stderr": 0.004798751281560844,
"acc_norm": 0.8332005576578371,
"acc_norm_stderr": 0.0037203482062126898
},
"harness|hendrycksTest-abstract_algebra|5": {
"acc": 0.31,
"acc_stderr": 0.046482319871173156,
"acc_norm": 0.31,
"acc_norm_stderr": 0.046482319871173156
},
"harness|hendrycksTest-anatomy|5": {
"acc": 0.562962962962963,
"acc_stderr": 0.04284958639753401,
"acc_norm": 0.562962962962963,
"acc_norm_stderr": 0.04284958639753401
},
"harness|hendrycksTest-astronomy|5": {
"acc": 0.6118421052631579,
"acc_stderr": 0.03965842097512744,
"acc_norm": 0.6118421052631579,
"acc_norm_stderr": 0.03965842097512744
},
"harness|hendrycksTest-business_ethics|5": {
"acc": 0.57,
"acc_stderr": 0.04975698519562428,
"acc_norm": 0.57,
"acc_norm_stderr": 0.04975698519562428
},
"harness|hendrycksTest-clinical_knowledge|5": {
"acc": 0.6641509433962264,
"acc_stderr": 0.029067220146644826,
"acc_norm": 0.6641509433962264,
"acc_norm_stderr": 0.029067220146644826
},
"harness|hendrycksTest-college_biology|5": {
"acc": 0.6736111111111112,
"acc_stderr": 0.03921067198982266,
"acc_norm": 0.6736111111111112,
"acc_norm_stderr": 0.03921067198982266
},
"harness|hendrycksTest-college_chemistry|5": {
"acc": 0.41,
"acc_stderr": 0.04943110704237102,
"acc_norm": 0.41,
"acc_norm_stderr": 0.04943110704237102
},
"harness|hendrycksTest-college_computer_science|5": {
"acc": 0.55,
"acc_stderr": 0.05,
"acc_norm": 0.55,
"acc_norm_stderr": 0.05
},
"harness|hendrycksTest-college_mathematics|5": {
"acc": 0.37,
"acc_stderr": 0.04852365870939099,
"acc_norm": 0.37,
"acc_norm_stderr": 0.04852365870939099
},
"harness|hendrycksTest-college_medicine|5": {
"acc": 0.5838150289017341,
"acc_stderr": 0.03758517775404947,
"acc_norm": 0.5838150289017341,
"acc_norm_stderr": 0.03758517775404947
},
"harness|hendrycksTest-college_physics|5": {
"acc": 0.4117647058823529,
"acc_stderr": 0.048971049527263666,
"acc_norm": 0.4117647058823529,
"acc_norm_stderr": 0.048971049527263666
},
"harness|hendrycksTest-computer_security|5": {
"acc": 0.72,
"acc_stderr": 0.045126085985421276,
"acc_norm": 0.72,
"acc_norm_stderr": 0.045126085985421276
},
"harness|hendrycksTest-conceptual_physics|5": {
"acc": 0.5404255319148936,
"acc_stderr": 0.03257901482099835,
"acc_norm": 0.5404255319148936,
"acc_norm_stderr": 0.03257901482099835
},
"harness|hendrycksTest-econometrics|5": {
"acc": 0.4298245614035088,
"acc_stderr": 0.04657047260594963,
"acc_norm": 0.4298245614035088,
"acc_norm_stderr": 0.04657047260594963
},
"harness|hendrycksTest-electrical_engineering|5": {
"acc": 0.6137931034482759,
"acc_stderr": 0.04057324734419035,
"acc_norm": 0.6137931034482759,
"acc_norm_stderr": 0.04057324734419035
},
"harness|hendrycksTest-elementary_mathematics|5": {
"acc": 0.3994708994708995,
"acc_stderr": 0.025225450284067877,
"acc_norm": 0.3994708994708995,
"acc_norm_stderr": 0.025225450284067877
},
"harness|hendrycksTest-formal_logic|5": {
"acc": 0.40476190476190477,
"acc_stderr": 0.04390259265377563,
"acc_norm": 0.40476190476190477,
"acc_norm_stderr": 0.04390259265377563
},
"harness|hendrycksTest-global_facts|5": {
"acc": 0.35,
"acc_stderr": 0.047937248544110196,
"acc_norm": 0.35,
"acc_norm_stderr": 0.047937248544110196
},
"harness|hendrycksTest-high_school_biology|5": {
"acc": 0.6870967741935484,
"acc_stderr": 0.026377567028645858,
"acc_norm": 0.6870967741935484,
"acc_norm_stderr": 0.026377567028645858
},
"harness|hendrycksTest-high_school_chemistry|5": {
"acc": 0.5221674876847291,
"acc_stderr": 0.03514528562175008,
"acc_norm": 0.5221674876847291,
"acc_norm_stderr": 0.03514528562175008
},
"harness|hendrycksTest-high_school_computer_science|5": {
"acc": 0.63,
"acc_stderr": 0.04852365870939098,
"acc_norm": 0.63,
"acc_norm_stderr": 0.04852365870939098
},
"harness|hendrycksTest-high_school_european_history|5": {
"acc": 0.7575757575757576,
"acc_stderr": 0.03346409881055953,
"acc_norm": 0.7575757575757576,
"acc_norm_stderr": 0.03346409881055953
},
"harness|hendrycksTest-high_school_geography|5": {
"acc": 0.7727272727272727,
"acc_stderr": 0.02985751567338642,
"acc_norm": 0.7727272727272727,
"acc_norm_stderr": 0.02985751567338642
},
"harness|hendrycksTest-high_school_government_and_politics|5": {
"acc": 0.8549222797927462,
"acc_stderr": 0.02541634309630643,
"acc_norm": 0.8549222797927462,
"acc_norm_stderr": 0.02541634309630643
},
"harness|hendrycksTest-high_school_macroeconomics|5": {
"acc": 0.558974358974359,
"acc_stderr": 0.025174048384000745,
"acc_norm": 0.558974358974359,
"acc_norm_stderr": 0.025174048384000745
},
"harness|hendrycksTest-high_school_mathematics|5": {
"acc": 0.3296296296296296,
"acc_stderr": 0.02866120111652458,
"acc_norm": 0.3296296296296296,
"acc_norm_stderr": 0.02866120111652458
},
"harness|hendrycksTest-high_school_microeconomics|5": {
"acc": 0.634453781512605,
"acc_stderr": 0.031282177063684614,
"acc_norm": 0.634453781512605,
"acc_norm_stderr": 0.031282177063684614
},
"harness|hendrycksTest-high_school_physics|5": {
"acc": 0



