open-llm-leaderboard-old/details_JunchengXie__Mistral-7B-Instruct-v0.2-gpt-4-80k-base_lora
收藏数据集概述
数据集简介
该数据集是在评估模型JunchengXie/Mistral-7B-Instruct-v0.2-gpt-4-80k-base_lora在Open LLM Leaderboard上的运行过程中自动创建的。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 数据来源:数据集从1次运行中创建,每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- 最新结果:"train"分割始终指向最新结果。
- 汇总结果:一个额外的配置"results"存储所有运行的汇总结果,用于计算和显示在Open LLM Leaderboard上的汇总指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_JunchengXie__Mistral-7B-Instruct-v0.2-gpt-4-80k-base_lora", "harness_winogrande_5", split="train")
最新结果
以下是最新结果来自2024-03-27T23:59:10.991747:
python { "all": { "acc": 0.582160493650072, "acc_stderr": 0.03365366470977369, "acc_norm": 0.5887215545601854, "acc_norm_stderr": 0.03434852603389613, "mc1": 0.5177478580171359, "mc1_stderr": 0.017492470843075356, "mc2": 0.6831698375539644, "mc2_stderr": 0.015593330487456654 }, "harness|arc:challenge|25": { "acc": 0.5665529010238908, "acc_stderr": 0.014481376224558902, "acc_norm": 0.5947098976109215, "acc_norm_stderr": 0.014346869060229321 }, "harness|hellaswag|10": { "acc": 0.6102370045807608, "acc_stderr": 0.004866997110388195, "acc_norm": 0.7969527982473611, "acc_norm_stderr": 0.0040144524737232646 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.29, "acc_stderr": 0.04560480215720683, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720683 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5777777777777777, "acc_stderr": 0.04266763404099582, "acc_norm": 0.5777777777777777, "acc_norm_stderr": 0.04266763404099582 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6118421052631579, "acc_stderr": 0.03965842097512744, "acc_norm": 0.6118421052631579, "acc_norm_stderr": 0.03965842097512744 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.56, "acc_stderr": 0.04988876515698589, "acc_norm": 0.56, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6452830188679245, "acc_stderr": 0.02944517532819959, "acc_norm": 0.6452830188679245, "acc_norm_stderr": 0.02944517532819959 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6388888888888888, "acc_stderr": 0.04016660030451233, "acc_norm": 0.6388888888888888, "acc_norm_stderr": 0.04016660030451233 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5549132947976878, "acc_stderr": 0.03789401760283647, "acc_norm": 0.5549132947976878, "acc_norm_stderr": 0.03789401760283647 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.04897104952726368, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.04897104952726368 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.48936170212765956, "acc_stderr": 0.03267862331014063, "acc_norm": 0.48936170212765956, "acc_norm_stderr": 0.03267862331014063 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4473684210526316, "acc_stderr": 0.046774730044911984, "acc_norm": 0.4473684210526316, "acc_norm_stderr": 0.046774730044911984 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.593103448275862, "acc_stderr": 0.04093793981266236, "acc_norm": 0.593103448275862, "acc_norm_stderr": 0.04093793981266236 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.38095238095238093, "acc_stderr": 0.025010749116137595, "acc_norm": 0.38095238095238093, "acc_norm_stderr": 0.025010749116137595 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.38095238095238093, "acc_stderr": 0.04343525428949098, "acc_norm": 0.38095238095238093, "acc_norm_stderr": 0.04343525428949098 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6064516129032258, "acc_stderr": 0.027791878753132274, "acc_norm": 0.6064516129032258, "acc_norm_stderr": 0.027791878753132274 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.4729064039408867, "acc_stderr": 0.03512819077876106, "acc_norm": 0.4729064039408867, "acc_norm_stderr": 0.03512819077876106 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7515151515151515, "acc_stderr": 0.033744026441394036, "acc_norm": 0.7515151515151515, "acc_norm_stderr": 0.033744026441394036 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7272727272727273, "acc_stderr": 0.03173071239071724, "acc_norm": 0.7272727272727273, "acc_norm_stderr": 0.03173071239071724 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8341968911917098, "acc_stderr": 0.026839845022314415, "acc_norm": 0.8341968911917098, "acc_norm_stderr": 0.026839845022314415 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5384615384615384, "acc_stderr": 0.025275892070240648, "acc_norm": 0.5384615384615384, "acc_norm_stderr": 0.025275892070240648 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc":



