open-llm-leaderboard-old/details_giraffe176__Open_Hermes_Maid_Sam_Mistral_dtv0.1
数据集概述
该数据集是在 Open LLM Leaderboard 上对模型 giraffe176/Open_Hermes_Maid_Sam_Mistral_dtv0.1 进行评估运行的过程中自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。
数据集结构
数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。"train" 分割始终指向最新的结果。
此外,一个额外的配置 "results" 存储了所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
以下是加载特定运行详细信息的示例代码:
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_giraffe176__Open_Hermes_Maid_Sam_Mistral_dtv0.1", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-02-19T07:25:14.448730 运行的最新结果:
python { "all": { "acc": 0.6492929541395905, "acc_stderr": 0.03204314290781419, "acc_norm": 0.6502356687496237, "acc_norm_stderr": 0.032693608758353816, "mc1": 0.41003671970624234, "mc1_stderr": 0.017217844717449325, "mc2": 0.5797198662912402, "mc2_stderr": 0.015180976093776475 }, "harness|arc:challenge|25": { "acc": 0.6390784982935154, "acc_stderr": 0.014034761386175456, "acc_norm": 0.6774744027303754, "acc_norm_stderr": 0.013659980894277366 }, "harness|hellaswag|10": { "acc": 0.6803425612427804, "acc_stderr": 0.004653907471785644, "acc_norm": 0.8638717386974706, "acc_norm_stderr": 0.003422238702226359 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6, "acc_stderr": 0.04232073695151589, "acc_norm": 0.6, "acc_norm_stderr": 0.04232073695151589 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6776315789473685, "acc_stderr": 0.03803510248351585, "acc_norm": 0.6776315789473685, "acc_norm_stderr": 0.03803510248351585 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7094339622641509, "acc_stderr": 0.027943219989337128, "acc_norm": 0.7094339622641509, "acc_norm_stderr": 0.027943219989337128 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.75, "acc_stderr": 0.03621034121889507, "acc_norm": 0.75, "acc_norm_stderr": 0.03621034121889507 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.46, "acc_stderr": 0.05009082659620332, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, 
"acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.653179190751445, "acc_stderr": 0.036291466701596636, "acc_norm": 0.653179190751445, "acc_norm_stderr": 0.036291466701596636 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.38235294117647056, "acc_stderr": 0.04835503696107224, "acc_norm": 0.38235294117647056, "acc_norm_stderr": 0.04835503696107224 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.76, "acc_stderr": 0.042923469599092816, "acc_norm": 0.76, "acc_norm_stderr": 0.042923469599092816 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5787234042553191, "acc_stderr": 0.03227834510146267, "acc_norm": 0.5787234042553191, "acc_norm_stderr": 0.03227834510146267 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5175438596491229, "acc_stderr": 0.04700708033551038, "acc_norm": 0.5175438596491229, "acc_norm_stderr": 0.04700708033551038 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5517241379310345, "acc_stderr": 0.04144311810878152, "acc_norm": 0.5517241379310345, "acc_norm_stderr": 0.04144311810878152 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.42063492063492064, "acc_stderr": 0.025424835086924003, "acc_norm": 0.42063492063492064, "acc_norm_stderr": 0.025424835086924003 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4603174603174603, "acc_stderr": 0.04458029125470973, "acc_norm": 0.4603174603174603, "acc_norm_stderr": 0.04458029125470973 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7838709677419354, "acc_stderr": 0.02341529343356852, "acc_norm": 0.7838709677419354, "acc_norm_stderr": 0.02341529343356852 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5123152709359606, "acc_stderr": 0.035169204442208966, "acc_norm": 0.5123152709359606, 
"acc_norm_stderr": 0.035169204442208966 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7878787878787878, "acc_stderr": 0.03192271569548301, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.03192271569548301 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7878787878787878, "acc_stderr": 0.029126522834586818, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.029126522834586818 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9015544041450777, "acc_stderr": 0.021500249576033456, "acc_norm": 0.9015544041450777, "acc_norm_stderr": 0.021500249576033456 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6641025641025641, "acc_stderr": 0.02394672474156398, "acc_norm": 0.6641025641025641, "acc_norm_stderr": 0.02394672474156398 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.362962962962963, "acc_stderr": 0.02931820364520686, "acc_norm": 0.362962962962963, "acc_norm_stderr": 0.02931820364520686 }, "harness|hendrycksTest-high_school_microeconomics|5": {



