open-llm-leaderboard-old/details_abacusai__Fewshot-Metamath-OrcaVicuna-Mistral
收藏数据集概述
该数据集是在模型abacusai/Fewshot-Metamath-OrcaVicuna-Mistral的评估运行期间自动创建的,用于Open LLM Leaderboard。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 创建来源:数据集从1次运行中创建。每个运行可以在每个配置中作为一个特定的分割找到,分割名称使用运行的时间戳。
- 最新结果:"train"分割始终指向最新的结果。
- 结果汇总:一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_abacusai__Fewshot-Metamath-OrcaVicuna-Mistral", "harness_winogrande_5", split="train")
最新结果
以下是2024-01-10T15:33:11.471365运行的最新结果:
python { "all": { "acc": 0.6202552921359215, "acc_stderr": 0.03264933582271829, "acc_norm": 0.6199993104526068, "acc_norm_stderr": 0.03332566222099852, "mc1": 0.3659730722154223, "mc1_stderr": 0.016862941684088365, "mc2": 0.5323239517078452, "mc2_stderr": 0.0151650266597209 }, "harness|arc:challenge|25": { "acc": 0.568259385665529, "acc_stderr": 0.014474591427196202, "acc_norm": 0.5964163822525598, "acc_norm_stderr": 0.014337158914268443 }, "harness|hellaswag|10": { "acc": 0.6259709221270663, "acc_stderr": 0.004828822920915222, "acc_norm": 0.8181637124078869, "acc_norm_stderr": 0.0038492126228151682 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.562962962962963, "acc_stderr": 0.042849586397534015, "acc_norm": 0.562962962962963, "acc_norm_stderr": 0.042849586397534015 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6644736842105263, "acc_stderr": 0.03842498559395268, "acc_norm": 0.6644736842105263, "acc_norm_stderr": 0.03842498559395268 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6716981132075471, "acc_stderr": 0.02890159361241178, "acc_norm": 0.6716981132075471, "acc_norm_stderr": 0.02890159361241178 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6736111111111112, "acc_stderr": 0.03921067198982266, "acc_norm": 0.6736111111111112, "acc_norm_stderr": 0.03921067198982266 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.53, "acc_stderr": 0.05016135580465919, "acc_norm": 0.53, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6416184971098265, "acc_stderr": 0.03656343653353159, "acc_norm": 0.6416184971098265, "acc_norm_stderr": 0.03656343653353159 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.35294117647058826, "acc_stderr": 0.04755129616062946, "acc_norm": 0.35294117647058826, "acc_norm_stderr": 0.04755129616062946 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.042295258468165065, "acc_norm": 0.77, "acc_norm_stderr": 0.042295258468165065 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5191489361702127, "acc_stderr": 0.03266204299064678, "acc_norm": 0.5191489361702127, "acc_norm_stderr": 0.03266204299064678 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.37719298245614036, "acc_stderr": 0.04559522141958216, "acc_norm": 0.37719298245614036, "acc_norm_stderr": 0.04559522141958216 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5517241379310345, "acc_stderr": 0.04144311810878152, "acc_norm": 0.5517241379310345, "acc_norm_stderr": 0.04144311810878152 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.3862433862433862, "acc_stderr": 0.02507598176760168, "acc_norm": 0.3862433862433862, "acc_norm_stderr": 0.02507598176760168 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4603174603174603, "acc_stderr": 0.04458029125470973, "acc_norm": 0.4603174603174603, "acc_norm_stderr": 0.04458029125470973 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7677419354838709, "acc_stderr": 0.024022256130308235, "acc_norm": 0.7677419354838709, "acc_norm_stderr": 0.024022256130308235 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.4876847290640394, "acc_stderr": 0.035169204442208966, "acc_norm": 0.4876847290640394, "acc_norm_stderr": 0.035169204442208966 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.61, "acc_stderr": 0.04902071300001975, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7393939393939394, "acc_stderr": 0.034277431758165236, "acc_norm": 0.7393939393939394, "acc_norm_stderr": 0.034277431758165236 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7878787878787878, "acc_stderr": 0.029126522834586815, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.029126522834586815 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8393782383419689, "acc_stderr": 0.02649905770139744, "acc_norm": 0.8393782383419689, "acc_norm_stderr": 0.02649905770139744 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6230769230769231, "acc_stderr": 0.024570975364225995, "acc_norm": 0.6230769230769231, "acc_norm_stderr": 0.024570975364225995 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3, "acc_stderr": 0.027940457136228412, "acc_norm": 0.3, "acc_norm_stderr": 0.027



