open-llm-leaderboard-old/details_minghaowu__phi-2-OpenHermes-2.5
数据集概述
数据集来源
该数据集是在评估模型 minghaowu/phi-2-OpenHermes-2.5 在 Open LLM Leaderboard 上的运行过程中自动创建的。
数据集组成
数据集包含 63 个配置,每个配置对应一个评估任务。数据集由 2 次运行创建;每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。"train" 分割始终指向最新的结果。
额外配置
一个额外的配置 "results" 存储了所有运行的聚合结果,用于计算和显示在 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_minghaowu__phi-2-OpenHermes-2.5", "harness_winogrande_5", split="train")
```
最新结果
以下是来自 2024-02-05T03:58:10.319056 这次运行的最新结果:
python { "all": { "acc": 0.5417779286978951, "acc_stderr": 0.03387715104203166, "acc_norm": 0.5515584103266279, "acc_norm_stderr": 0.03480262412206108, "mc1": 0.33047735618115054, "mc1_stderr": 0.0164667696136983, "mc2": 0.48101216272366176, "mc2_stderr": 0.014866987436710487 }, "harness|arc:challenge|25": { "acc": 0.5332764505119454, "acc_stderr": 0.0145789958596058, "acc_norm": 0.5648464163822525, "acc_norm_stderr": 0.014487986197186043 }, "harness|hellaswag|10": { "acc": 0.5468034256124278, "acc_stderr": 0.004967872475383275, "acc_norm": 0.738797052380004, "acc_norm_stderr": 0.00438392514747874 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.31, "acc_stderr": 0.046482319871173156, "acc_norm": 0.31, "acc_norm_stderr": 0.046482319871173156 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4666666666666667, "acc_stderr": 0.043097329010363554, "acc_norm": 0.4666666666666667, "acc_norm_stderr": 0.043097329010363554 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5460526315789473, "acc_stderr": 0.04051646342874143, "acc_norm": 0.5460526315789473, "acc_norm_stderr": 0.04051646342874143 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.54, "acc_stderr": 0.05009082659620332, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.5660377358490566, "acc_stderr": 0.030503292013342592, "acc_norm": 0.5660377358490566, "acc_norm_stderr": 0.030503292013342592 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5902777777777778, "acc_stderr": 0.04112490974670787, "acc_norm": 0.5902777777777778, "acc_norm_stderr": 0.04112490974670787 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-college_mathematics|5": 
{ "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5202312138728323, "acc_stderr": 0.03809342081273956, "acc_norm": 0.5202312138728323, "acc_norm_stderr": 0.03809342081273956 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3627450980392157, "acc_stderr": 0.047840607041056527, "acc_norm": 0.3627450980392157, "acc_norm_stderr": 0.047840607041056527 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.68, "acc_stderr": 0.04688261722621505, "acc_norm": 0.68, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.502127659574468, "acc_stderr": 0.03268572658667492, "acc_norm": 0.502127659574468, "acc_norm_stderr": 0.03268572658667492 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.3508771929824561, "acc_stderr": 0.044895393502707, "acc_norm": 0.3508771929824561, "acc_norm_stderr": 0.044895393502707 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4827586206896552, "acc_stderr": 0.041641887201693775, "acc_norm": 0.4827586206896552, "acc_norm_stderr": 0.041641887201693775 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4074074074074074, "acc_stderr": 0.025305906241590632, "acc_norm": 0.4074074074074074, "acc_norm_stderr": 0.025305906241590632 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3888888888888889, "acc_stderr": 0.04360314860077459, "acc_norm": 0.3888888888888889, "acc_norm_stderr": 0.04360314860077459 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.39, "acc_stderr": 0.04902071300001974, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6516129032258065, "acc_stderr": 0.02710482632810094, "acc_norm": 0.6516129032258065, "acc_norm_stderr": 0.02710482632810094 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.4482758620689655, "acc_stderr": 0.034991131376767445, 
"acc_norm": 0.4482758620689655, "acc_norm_stderr": 0.034991131376767445 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6363636363636364, "acc_stderr": 0.03756335775187898, "acc_norm": 0.6363636363636364, "acc_norm_stderr": 0.03756335775187898 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7121212121212122, "acc_stderr": 0.03225883512300992, "acc_norm": 0.7121212121212122, "acc_norm_stderr": 0.03225883512300992 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7461139896373057, "acc_stderr": 0.03141024780565322, "acc_norm": 0.7461139896373057, "acc_norm_stderr": 0.03141024780565322 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5384615384615384, "acc_stderr": 0.025275892070240644, "acc_norm": 0.5384615384615384, "acc_norm_stderr": 0.025275892070240644 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2814814814814815, "acc_stderr": 0.027420019350945277, "acc_norm": 0.2814814814814815, "acc_norm_stderr":



