open-llm-leaderboard-old/details_Weyaxi__Einstein-v5-v0.2-7B
数据集概述
数据集简介
该数据集是在评估模型Weyaxi/Einstein-v5-v0.2-7B在Open LLM Leaderboard上的运行过程中自动创建的。
数据集组成
数据集由63个配置组成,每个配置对应一个评估任务。数据集是从1次运行中创建的;每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。"train"分割始终指向最新的结果。
额外配置
一个额外的配置"results"存储了所有运行的聚合结果,用于计算并在Open LLM Leaderboard上显示聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_Weyaxi__Einstein-v5-v0.2-7B", "harness_winogrande_5", split="train")
```
最新结果
以下是2024-03-27T21:09:37.228677运行的最新结果:
python { "all": { "acc": 0.612286564752706, "acc_stderr": 0.032839983165383065, "acc_norm": 0.6135779860343825, "acc_norm_stderr": 0.03350956178751591, "mc1": 0.3537331701346389, "mc1_stderr": 0.016737814358846147, "mc2": 0.5259333753586267, "mc2_stderr": 0.015070357329952046 }, "harness|arc:challenge|25": { "acc": 0.5691126279863481, "acc_stderr": 0.014471133392642463, "acc_norm": 0.6092150170648464, "acc_norm_stderr": 0.01425856388051378 }, "harness|hellaswag|10": { "acc": 0.6148177653853814, "acc_stderr": 0.004856437955719861, "acc_norm": 0.8098984266082454, "acc_norm_stderr": 0.003915792315457802 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5481481481481482, "acc_stderr": 0.042992689054808644, "acc_norm": 0.5481481481481482, "acc_norm_stderr": 0.042992689054808644 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6644736842105263, "acc_stderr": 0.03842498559395269, "acc_norm": 0.6644736842105263, "acc_norm_stderr": 0.03842498559395269 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.54, "acc_stderr": 0.05009082659620333, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.660377358490566, "acc_stderr": 0.029146904747798328, "acc_norm": 0.660377358490566, "acc_norm_stderr": 0.029146904747798328 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7083333333333334, "acc_stderr": 0.038009680605548594, "acc_norm": 0.7083333333333334, "acc_norm_stderr": 0.038009680605548594 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.47, "acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5606936416184971, "acc_stderr": 0.037842719328874674, "acc_norm": 0.5606936416184971, "acc_norm_stderr": 0.037842719328874674 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3235294117647059, "acc_stderr": 0.04655010411319619, "acc_norm": 0.3235294117647059, "acc_norm_stderr": 0.04655010411319619 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.76, "acc_stderr": 0.042923469599092816, "acc_norm": 0.76, "acc_norm_stderr": 0.042923469599092816 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5319148936170213, "acc_stderr": 0.03261936918467383, "acc_norm": 0.5319148936170213, "acc_norm_stderr": 0.03261936918467383 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4649122807017544, "acc_stderr": 0.046920083813689104, "acc_norm": 0.4649122807017544, "acc_norm_stderr": 0.046920083813689104 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5103448275862069, "acc_stderr": 0.04165774775728763, "acc_norm": 0.5103448275862069, "acc_norm_stderr": 0.04165774775728763 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41534391534391535, "acc_stderr": 0.025379524910778394, "acc_norm": 0.41534391534391535, "acc_norm_stderr": 0.025379524910778394 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.42857142857142855, "acc_stderr": 0.04426266681379909, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.04426266681379909 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.635483870967742, "acc_stderr": 0.027379871229943245, "acc_norm": 0.635483870967742, "acc_norm_stderr": 0.027379871229943245 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.4729064039408867, "acc_stderr": 0.03512819077876106, "acc_norm": 0.4729064039408867, "acc_norm_stderr": 0.03512819077876106 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.67, "acc_stderr": 0.047258156262526066, "acc_norm": 0.67, "acc_norm_stderr": 0.047258156262526066 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7757575757575758, "acc_stderr": 0.032568666616811015, "acc_norm": 0.7757575757575758, "acc_norm_stderr": 0.032568666616811015 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7626262626262627, "acc_stderr": 0.03031371053819889, "acc_norm": 0.7626262626262627, "acc_norm_stderr": 0.03031371053819889 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8549222797927462, "acc_stderr": 0.025416343096306422, "acc_norm": 0.8549222797927462, "acc_norm_stderr": 0.025416343096306422 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5974358974358974, "acc_stderr": 0.02486499515976775, "acc_norm": 0.5974358974358974, "acc_norm_stderr": 0.02486499515976775 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3, "acc_stderr": 0.027940457136228416, "acc_norm": 0.3, "acc_norm_stderr": 0.027940457136228416 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6302521008403361, "acc_stderr": 0.03135709599613591, "acc_norm": 0.6302521008403361, "acc_norm_stderr": 0.03135709599613591 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.31125827814569534, "acc_stderr": 0.03780445850526732, "acc_norm": 0.31125827814569534, "acc_norm_stderr": 0.03780445850526732 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.7944954128440367, "acc_stderr": 0.017324352325016015, "acc_norm": 0.7944954128440367, "acc_norm_stderr": 0.017324352325016015 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.44907407407407407, "acc_stderr": 0.03392238405321616, "acc_norm": 0.44907407407407407, "acc_norm_stderr": 0.03392238405321616 }, 
"harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8235294117647058, "acc_stderr": 0.026756401538078966, "acc_norm": 0.8235294117647058, "acc_norm_stderr": 0.026756401538078966 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.7848101265822784, "acc_stderr": 0.026750826994676187, "acc_norm": 0.7848101265822784, "acc_norm_stderr": 0.026750826994676187 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6547085201793722, "acc_stderr": 0.03191100192835794, "acc_norm": 0.6547085201793722, "acc_norm_stderr": 0.03191100192835794 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7251908396946565, "acc_stderr": 0.03915345408847836, "acc_norm": 0.7251908396946565, "acc_norm_stderr": 0.03915345408847836 }, "harness|hendrycksTest-international_law|5": { "acc": 0.7603305785123967, "acc_stderr": 0.03896878985070416, "acc_norm": 0.7603305785123967, "acc_norm_stderr": 0.03896878985070416 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7870370370370371, "acc_stderr": 0.0395783547198098, "acc_norm": 0.7870370370370371, "acc_norm_stderr": 0.0395783547198098 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7239263803680982, "acc_stderr": 0.03512385283705048, "acc_norm": 0.7239263803680982, "acc_norm_stderr": 0.03512385283705048 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.4642857142857143, "acc_stderr": 0.04733667890053756, "acc_norm": 0.4642857142857143, "acc_norm_stderr": 0.04733667890053756 }, "harness|hendrycksTest-management|5": { "acc": 0.7864077669902912, "acc_stderr": 0.040580420156460344, "acc_norm": 0.7864077669902912, "acc_norm_stderr": 0.040580420156460344 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8547008547008547, "acc_stderr": 0.02308663508684141, "acc_norm": 0.8547008547008547, "acc_norm_stderr": 0.02308663508684141 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, 
"harness|hendrycksTest-miscellaneous|5": { "acc": 0.7931034482758621, "acc_stderr": 0.014485656041669178, "acc_norm": 0.7931034482758621, "acc_norm_stderr": 0.014485656041669178 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.6965317919075145, "acc_stderr": 0.02475241196091721, "acc_norm": 0.6965317919075145, "acc_norm_stderr": 0.02475241196091721 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.2759776536312849, "acc_stderr": 0.014950103002475358, "acc_norm": 0.2759776536312849, "acc_norm_stderr": 0.014950103002475358 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7254901960784313, "acc_stderr": 0.025553169991826517, "acc_norm": 0.7254901960784313, "acc_norm_stderr": 0.025553169991826517 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.6913183279742765, "acc_stderr": 0.026236965881153266, "acc_norm": 0.6913183279742765, "acc_norm_stderr": 0.026236965881153266 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7129629629629629, "acc_stderr": 0.02517104191530968, "acc_norm": 0.7129629629629629, "acc_norm_stderr": 0.02517104191530968 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.44680851063829785, "acc_stderr": 0.029658235097666907, "acc_norm": 0.44680851063829785, "acc_norm_stderr": 0.029658235097666907 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.4621903520208605, "acc_stderr": 0.012733671880342507, "acc_norm": 0.4621903520208605, "acc_norm_stderr": 0.012733671880342507 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.5845588235294118, "acc_stderr": 0.029935342707877746, "acc_norm": 0.5845588235294118, "acc_norm_stderr": 0.029935342707877746 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6568627450980392, "acc_stderr": 0.019206606848825362, "acc_norm": 0.6568627450980392, "acc_norm_stderr": 0.019206606848825362 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6363636363636364, "acc_stderr": 0.04607582090719976, "acc_norm": 0.6363636363636364, "acc_norm_stderr": 
0.04607582090719976 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7346938775510204, "acc_stderr": 0.028263889943784593, "acc_norm": 0.7346938775510204, "acc_norm_stderr": 0.028263889943784593 }, "harness|hendrycksTest-sociology|5": { "acc": 0.7860696517412935, "acc_stderr": 0.028996909693328913, "acc_norm": 0.7860696517412935, "acc_norm_stderr": 0.028996909693328913 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.85, "acc_stderr": 0.03588702812826369, "acc_norm": 0.85, "acc_norm_stderr": 0.03588702812826369 }, "harness|hendrycksTest-virology|5": { "acc": 0.4879518072289157, "acc_stderr": 0.03891364495835821, "acc_norm": 0.4879518072289157, "acc_norm_stderr": 0.03891364495835821 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.7953216374269005, "acc_stderr": 0.030944459778533207, "acc_norm": 0.7953216374269005, "acc_norm_stderr": 0.030944459778533207 }, "harness|truthfulqa:mc|0": { "mc1": 0.3537331701346389, "mc1_stderr": 0.016737814358846147, "mc2": 0.5259333753586267, "mc2_stderr": 0.015070357329952046 }, "harness|winogrande|5": { "acc": 0.7868981846882399, "acc_stderr": 0.011508957690722769 }, "harness|gsm8k|5": { "acc": 0.5966641394996209, "acc_stderr": 0.01351265478181471 } }
配置详情
- config_name: harness_arc_challenge_25
  - 分割: 2024_03_27T21_09_37.228677
    - 路径: `**/details_harness|arc:challenge|25_2024-03-27T21-09-37.228677.parquet`
  - 分割: latest
    - 路径: `**/details_harness|arc:challenge|25_2024-03-27T21-09-37.228677.parquet`
- config_name: harness_gsm8k_5
  - 分割: 2024_03_27T21_09_37.228677
    - 路径: `**/details_harness|gsm8k|5_2024-03-27T21-09-37.228677.parquet`
  - 分割: latest
    - 路径: `**/details_harness|gsm8k|5_2024-03-27T21-09-37.228677.parquet`
- config_name: harness_hellaswag_10
  - 分割: 2024_03_27T21_09_37.228677
    - 路径: `**/details_harness|hellaswag|10_2024-03-27T21-09-37.228677.parquet`
  - 分割: latest
    - 路径: `**/details_harness|hellaswag|10_2024-03-27T21-09-37.228677.parquet`
-
config_name: harness_hendrycksTest_5
- 分割: 2024_03_27T21_09_37.228677
- 路径:
**/details_harness|hendrycksTest-abstract_algebra|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-anatomy|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-astronomy|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-business_ethics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-clinical_knowledge|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-college_biology|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-college_chemistry|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-college_computer_science|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-college_mathematics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-college_medicine|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-college_physics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-computer_security|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-conceptual_physics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-econometrics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-electrical_engineering|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-elementary_mathematics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-formal_logic|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-global_facts|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_biology|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_chemistry|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_computer_science|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_european_history|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_geography|5_2024-03-27T21-09-37.2
28677.parquet**/details_harness|hendrycksTest-high_school_government_and_politics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_macroeconomics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_mathematics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_microeconomics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_physics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_psychology|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_statistics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_us_history|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-high_school_world_history|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-human_aging|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-human_sexuality|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-international_law|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-jurisprudence|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-logical_fallacies|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-machine_learning|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-management|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-marketing|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-medical_genetics|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-miscellaneous|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-moral_disputes|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-moral_scenarios|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-nutrition|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-philosophy|5_2024-0
3-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-prehistory|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-professional_accounting|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-professional_law|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-professional_medicine|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-professional_psychology|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-public_relations|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-security_studies|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-sociology|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-us_foreign_policy|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-virology|5_2024-03-27T21-09-37.228677.parquet**/details_harness|hendrycksTest-world_religions|5_2024-03-27T21-09-37.228677.parquet
- 路径:
- 分割: 2024_03_27T21_09_37.228677



