open-llm-leaderboard-old/details_Weyaxi__Stellaris-internlm2-20b-r512
数据集概述
数据集摘要
该数据集是在评估模型Weyaxi/Stellaris-internlm2-20b-r512在Open LLM Leaderboard上的运行过程中自动创建的。
数据集组成
- 数据集由63个配置组成,每个配置对应一个评估任务。
- 数据集由1次运行创建。每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。"train"分割始终指向最新的结果。
- 一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
```python
from datasets import load_dataset
data = load_dataset("open-llm-leaderboard/details_Weyaxi__Stellaris-internlm2-20b-r512", "harness_winogrande_5", split="train")
```
最新结果
这些是最新结果的示例: python { "all": { "acc": 0.6566278372965977, "acc_stderr": 0.03172928992616415, "acc_norm": 0.6659506224432014, "acc_norm_stderr": 0.03244356975655913, "mc1": 0.3157894736842105, "mc1_stderr": 0.01627228795791691, "mc2": 0.4950678335769212, "mc2_stderr": 0.015192417727874554 }, "harness|arc:challenge|25": { "acc": 0.590443686006826, "acc_stderr": 0.014370358632472437, "acc_norm": 0.6382252559726962, "acc_norm_stderr": 0.014041957945038076 }, "harness|hellaswag|10": { "acc": 0.6601274646484764, "acc_stderr": 0.00472697660713081, "acc_norm": 0.8399721171081458, "acc_norm_stderr": 0.0036588262081016093 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5925925925925926, "acc_stderr": 0.04244633238353227, "acc_norm": 0.5925925925925926, "acc_norm_stderr": 0.04244633238353227 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7763157894736842, "acc_stderr": 0.03391160934343603, "acc_norm": 0.7763157894736842, "acc_norm_stderr": 0.03391160934343603 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.72, "acc_stderr": 0.045126085985421276, "acc_norm": 0.72, "acc_norm_stderr": 0.045126085985421276 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7358490566037735, "acc_stderr": 0.027134291628741713, "acc_norm": 0.7358490566037735, "acc_norm_stderr": 0.027134291628741713 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7916666666666666, "acc_stderr": 0.033961162058453336, "acc_norm": 0.7916666666666666, "acc_norm_stderr": 0.033961162058453336 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.47, "acc_stderr": 0.05016135580465919, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.51, "acc_stderr": 0.05024183937956911, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956911 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6820809248554913, "acc_stderr": 0.0355068398916558, "acc_norm": 0.6820809248554913, "acc_norm_stderr": 0.0355068398916558 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.45098039215686275, "acc_stderr": 0.04951218252396264, "acc_norm": 0.45098039215686275, "acc_norm_stderr": 0.04951218252396264 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.8, "acc_stderr": 0.04020151261036845, "acc_norm": 0.8, "acc_norm_stderr": 0.04020151261036845 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6680851063829787, "acc_stderr": 0.030783736757745647, "acc_norm": 0.6680851063829787, "acc_norm_stderr": 0.030783736757745647 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.47368421052631576, "acc_stderr": 0.046970851366478626, "acc_norm": 0.47368421052631576, "acc_norm_stderr": 0.046970851366478626 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.593103448275862, "acc_stderr": 0.04093793981266237, "acc_norm": 0.593103448275862, "acc_norm_stderr": 0.04093793981266237 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.49206349206349204, "acc_stderr": 0.02574806587167329, "acc_norm": 0.49206349206349204, "acc_norm_stderr": 0.02574806587167329 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4444444444444444, "acc_stderr": 0.04444444444444449, "acc_norm": 0.4444444444444444, "acc_norm_stderr": 0.04444444444444449 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.28, "acc_stderr": 0.04512608598542127, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.832258064516129, "acc_stderr": 0.021255464065371325, "acc_norm": 0.832258064516129, "acc_norm_stderr": 0.021255464065371325 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.5911330049261084, "acc_stderr": 0.03459058815883233, "acc_norm": 0.5911330049261084, "acc_norm_stderr": 0.03459058815883233 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.73, "acc_stderr": 0.044619604333847394, "acc_norm": 0.73, "acc_norm_stderr": 0.044619604333847394 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8, "acc_stderr": 0.031234752377721164, "acc_norm": 0.8, "acc_norm_stderr": 0.031234752377721164 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8282828282828283, "acc_stderr": 0.026869716187429914, "acc_norm": 0.8282828282828283, "acc_norm_stderr": 0.026869716187429914 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8756476683937824, "acc_stderr": 0.023814477086593552, "acc_norm": 0.8756476683937824, "acc_norm_stderr": 0.023814477086593552 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6923076923076923, "acc_stderr": 0.02340092891831049, "acc_norm": 0.6923076923076923, "acc_norm_stderr": 0.02340092891831049 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.34814814814814815, "acc_stderr": 0.02904560029061627, "acc_norm": 0.34814814814814815, "acc_norm_stderr": 0.02904560029061627 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.7184873949579832, "acc_stderr": 0.02921354941437216, "acc_norm": 0.718487



