open-llm-leaderboard/details_BEE-spoke-data__verysmol_llama-v11-KIx2
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上对模型 BEE-spoke-data/verysmol_llama-v11-KIx2 进行评估运行时自动创建的。
数据集组成
- 数据集包含 64 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_BEE-spoke-data__verysmol_llama-v11-KIx2_public", "harness_winogrande_5", split="train")
```
最新结果
以下是 2023-11-13T13:21:49.840481 运行的最新结果:
python { "all": { "acc": 0.25242844116774144, "acc_stderr": 0.030580549886448656, "acc_norm": 0.25279484630397214, "acc_norm_stderr": 0.03136408554761852, "mc1": 0.2521419828641371, "mc1_stderr": 0.015201522246299962, "mc2": 0.44749716634136827, "mc2_stderr": 0.015554683095212777, "em": 0.001153523489932886, "em_stderr": 0.0003476179896857093, "f1": 0.03032822986577186, "f1_stderr": 0.0010726730256709186 }, "harness|arc:challenge|25": { "acc": 0.19795221843003413, "acc_stderr": 0.011643990971573407, "acc_norm": 0.22696245733788395, "acc_norm_stderr": 0.012240491536132866 }, "harness|hellaswag|10": { "acc": 0.2698665604461263, "acc_stderr": 0.0044298311529146735, "acc_norm": 0.27604062935670187, "acc_norm_stderr": 0.004461235175488315 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.17, "acc_stderr": 0.03775251680686371, "acc_norm": 0.17, "acc_norm_stderr": 0.03775251680686371 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.26666666666666666, "acc_stderr": 0.038201699145179055, "acc_norm": 0.26666666666666666, "acc_norm_stderr": 0.038201699145179055 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.17763157894736842, "acc_stderr": 0.031103182383123398, "acc_norm": 0.17763157894736842, "acc_norm_stderr": 0.031103182383123398 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2188679245283019, "acc_stderr": 0.02544786382510863, "acc_norm": 0.2188679245283019, "acc_norm_stderr": 0.02544786382510863 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2361111111111111, "acc_stderr": 0.03551446610810826, "acc_norm": 0.2361111111111111, "acc_norm_stderr": 0.03551446610810826 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.23, 
"acc_stderr": 0.04229525846816508, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816508 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.22, "acc_stderr": 0.04163331998932268, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932268 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.2023121387283237, "acc_stderr": 0.03063114553919882, "acc_norm": 0.2023121387283237, "acc_norm_stderr": 0.03063114553919882 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.21568627450980393, "acc_stderr": 0.04092563958237654, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237654 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.21, "acc_stderr": 0.04093601807403326, "acc_norm": 0.21, "acc_norm_stderr": 0.04093601807403326 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.26382978723404255, "acc_stderr": 0.028809989854102973, "acc_norm": 0.26382978723404255, "acc_norm_stderr": 0.028809989854102973 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2719298245614035, "acc_stderr": 0.04185774424022057, "acc_norm": 0.2719298245614035, "acc_norm_stderr": 0.04185774424022057 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.21379310344827587, "acc_stderr": 0.0341652044774755, "acc_norm": 0.21379310344827587, "acc_norm_stderr": 0.0341652044774755 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.25132275132275134, "acc_stderr": 0.022340482339643898, "acc_norm": 0.25132275132275134, "acc_norm_stderr": 0.022340482339643898 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.21428571428571427, "acc_stderr": 0.036700664510471825, "acc_norm": 0.21428571428571427, "acc_norm_stderr": 0.036700664510471825 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.14, "acc_stderr": 0.034873508801977704, "acc_norm": 0.14, "acc_norm_stderr": 0.034873508801977704 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.3096774193548387, "acc_stderr": 0.026302774983517418, "acc_norm": 0.3096774193548387, 
"acc_norm_stderr": 0.026302774983517418 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.2955665024630542, "acc_stderr": 0.032104944337514575, "acc_norm": 0.2955665024630542, "acc_norm_stderr": 0.032104944337514575 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.24, "acc_stderr": 0.04292346959909282, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.2606060606060606, "acc_stderr": 0.034277431758165236, "acc_norm": 0.2606060606060606, "acc_norm_stderr": 0.034277431758165236 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.3383838383838384, "acc_stderr": 0.03371124142626304, "acc_norm": 0.3383838383838384, "acc_norm_stderr": 0.03371124142626304 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.27461139896373055, "acc_stderr": 0.03221024508041154, "acc_norm": 0.27461139896373055, "acc_norm_stderr": 0.03221024508041154 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.33076923076923076, "acc_stderr": 0.02385479568097



