open-llm-leaderboard/details_krevas__LDCC-Instruct-Llama-2-ko-13B-v2
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 krevas/LDCC-Instruct-Llama-2-ko-13B-v2 时自动创建的。
数据集组成
- 该数据集包含 61 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行的结果创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 另有一个额外的配置 "results",存储所有运行的聚合结果,用于计算并显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_krevas__LDCC-Instruct-Llama-2-ko-13B-v2", "harness_truthfulqa_mc_0", split="train")
```
最新结果
以下是 2023-10-10T06:04:26.663902 运行的最新结果:
python { "all": { "acc": 0.45958883488115343, "acc_stderr": 0.034511714778603424, "acc_norm": 0.4636864222606454, "acc_norm_stderr": 0.03449288105358144, "mc1": 0.2668298653610771, "mc1_stderr": 0.015483691939237265, "mc2": 0.39776112473254976, "mc2_stderr": 0.013677730634490858 }, "harness|arc:challenge|25": { "acc": 0.5298634812286689, "acc_stderr": 0.014585305840007105, "acc_norm": 0.5639931740614335, "acc_norm_stderr": 0.014491225699230916 }, "harness|hellaswag|10": { "acc": 0.6105357498506274, "acc_stderr": 0.004866322258335963, "acc_norm": 0.8181637124078869, "acc_norm_stderr": 0.0038492126228151717 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.43703703703703706, "acc_stderr": 0.04284958639753399, "acc_norm": 0.43703703703703706, "acc_norm_stderr": 0.04284958639753399 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.4342105263157895, "acc_stderr": 0.040335656678483205, "acc_norm": 0.4342105263157895, "acc_norm_stderr": 0.040335656678483205 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.53, "acc_stderr": 0.05016135580465919, "acc_norm": 0.53, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.46037735849056605, "acc_stderr": 0.030676096599389188, "acc_norm": 0.46037735849056605, "acc_norm_stderr": 0.030676096599389188 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5486111111111112, "acc_stderr": 0.041614023984032786, "acc_norm": 0.5486111111111112, "acc_norm_stderr": 0.041614023984032786 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.27, "acc_stderr": 0.0446196043338474, "acc_norm": 0.27, "acc_norm_stderr": 0.0446196043338474 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.22, "acc_stderr": 0.04163331998932268, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932268 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.3352601156069364, "acc_stderr": 0.03599586301247077, "acc_norm": 0.3352601156069364, "acc_norm_stderr": 0.03599586301247077 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.17647058823529413, "acc_stderr": 0.03793281185307809, "acc_norm": 0.17647058823529413, "acc_norm_stderr": 0.03793281185307809 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.62, "acc_stderr": 0.048783173121456316, "acc_norm": 0.62, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.3617021276595745, "acc_stderr": 0.03141082197596239, "acc_norm": 0.3617021276595745, "acc_norm_stderr": 0.03141082197596239 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2631578947368421, "acc_stderr": 0.04142439719489359, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.04142439719489359 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.3103448275862069, "acc_stderr": 0.03855289616378948, "acc_norm": 0.3103448275862069, "acc_norm_stderr": 0.03855289616378948 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.24074074074074073, "acc_stderr": 0.0220190800122179, "acc_norm": 0.24074074074074073, "acc_norm_stderr": 0.0220190800122179 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.23015873015873015, "acc_stderr": 0.03764950879790606, "acc_norm": 0.23015873015873015, "acc_norm_stderr": 0.03764950879790606 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.27, "acc_stderr": 0.044619604333847415, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847415 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.5096774193548387, "acc_stderr": 0.02843867799890955, "acc_norm": 0.5096774193548387, "acc_norm_stderr": 0.02843867799890955 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.3399014778325123, "acc_stderr": 0.033327690684107895, "acc_norm": 0.3399014778325123, "acc_norm_stderr": 0.033327690684107895 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.5818181818181818, "acc_stderr": 0.03851716319398395, "acc_norm": 0.5818181818181818, "acc_norm_stderr": 0.03851716319398395 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.5404040404040404, "acc_stderr": 0.035507024651313425, "acc_norm": 0.5404040404040404, "acc_norm_stderr": 0.035507024651313425 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.689119170984456, "acc_stderr": 0.033403619062765864, "acc_norm": 0.689119170984456, "acc_norm_stderr": 0.033403619062765864 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.40512820512820513, "acc_stderr": 0.024890471769938145, "acc_norm": 0.40512820512820513, "acc_norm_stderr": 0.024890471769938145 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.27037037037037037, "acc_stderr": 0



