open-llm-leaderboard-old/details_stabilityai__stablelm-2-1_6b
数据集概述
数据集简介
该数据集是在评估模型 stabilityai/stablelm-2-1_6b 的过程中自动创建的,用于 Open LLM Leaderboard 的评估。
数据集组成
- 该数据集包含 63 个配置,每个配置对应一个评估任务。
- 该数据集由 1 次运行创建;每次运行在各个配置中对应一个独立的分割(split),分割名称即该次运行的时间戳。
- 每个配置中的 "train" 分割总是指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_stabilityai__stablelm-2-1_6b", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-01-24T10:43:24.406547 运行的最新结果:
python { "all": { "acc": 0.3923741043833077, "acc_stderr": 0.03405640954935936, "acc_norm": 0.3955514306541472, "acc_norm_stderr": 0.03480165961817428, "mc1": 0.22031823745410037, "mc1_stderr": 0.014509045171487283, "mc2": 0.36783858238841727, "mc2_stderr": 0.013915102083485486 }, "harness|arc:challenge|25": { "acc": 0.3967576791808874, "acc_stderr": 0.014296513020180632, "acc_norm": 0.4334470989761092, "acc_norm_stderr": 0.014481376224558896 }, "harness|hellaswag|10": { "acc": 0.5185222067317268, "acc_stderr": 0.004986356526063975, "acc_norm": 0.7045409281019717, "acc_norm_stderr": 0.004553164013379557 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4444444444444444, "acc_stderr": 0.04292596718256981, "acc_norm": 0.4444444444444444, "acc_norm_stderr": 0.04292596718256981 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.3157894736842105, "acc_stderr": 0.0378272898086547, "acc_norm": 0.3157894736842105, "acc_norm_stderr": 0.0378272898086547 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.3169811320754717, "acc_stderr": 0.028637235639800935, "acc_norm": 0.3169811320754717, "acc_norm_stderr": 0.028637235639800935 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.4305555555555556, "acc_stderr": 0.04140685639111503, "acc_norm": 0.4305555555555556, "acc_norm_stderr": 0.04140685639111503 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.32, "acc_stderr": 0.04688261722621504, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621504 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.4, "acc_stderr": 0.049236596391733084, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084 }, "harness|hendrycksTest-college_mathematics|5": 
{ "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.31213872832369943, "acc_stderr": 0.035331333893236574, "acc_norm": 0.31213872832369943, "acc_norm_stderr": 0.035331333893236574 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.1568627450980392, "acc_stderr": 0.03618664819936248, "acc_norm": 0.1568627450980392, "acc_norm_stderr": 0.03618664819936248 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.47, "acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.3276595744680851, "acc_stderr": 0.030683020843231004, "acc_norm": 0.3276595744680851, "acc_norm_stderr": 0.030683020843231004 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2631578947368421, "acc_stderr": 0.04142439719489362, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.04142439719489362 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.503448275862069, "acc_stderr": 0.04166567577101579, "acc_norm": 0.503448275862069, "acc_norm_stderr": 0.04166567577101579 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.2698412698412698, "acc_stderr": 0.022860838309232072, "acc_norm": 0.2698412698412698, "acc_norm_stderr": 0.022860838309232072 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.24603174603174602, "acc_stderr": 0.038522733649243156, "acc_norm": 0.24603174603174602, "acc_norm_stderr": 0.038522733649243156 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.28, "acc_stderr": 0.045126085985421276, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.3870967741935484, "acc_stderr": 0.027709359675032495, "acc_norm": 0.3870967741935484, "acc_norm_stderr": 0.027709359675032495 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.24630541871921183, "acc_stderr": 
0.030315099285617732, "acc_norm": 0.24630541871921183, "acc_norm_stderr": 0.030315099285617732 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.503030303030303, "acc_stderr": 0.03904272341431856, "acc_norm": 0.503030303030303, "acc_norm_stderr": 0.03904272341431856 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.5050505050505051, "acc_stderr": 0.035621707606254015, "acc_norm": 0.5050505050505051, "acc_norm_stderr": 0.035621707606254015 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.46113989637305697, "acc_stderr": 0.03597524411734579, "acc_norm": 0.46113989637305697, "acc_norm_stderr": 0.03597524411734579 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.32564102564102565, "acc_stderr": 0.02375966576741229, "acc_norm": 0.32564102564102565, "acc_norm_stderr": 0.02375966576741229 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.26666666666666666, "acc_stderr": 0.02696242432507384, "acc_norm": 0.26666666666666666, "acc_norm_stderr": 0.026



