open-llm-leaderboard-old/details_lole25__phi-2-sft-ultrachat-full
收藏数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 lole25/phi-2-sft-ultrachat-full 时自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的拆分,拆分以该次运行的时间戳命名。
- "train" 拆分始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_lole25__phi-2-sft-ultrachat-full",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-04-08T08:42:29.755628 运行的最新结果:
python { "all": { "acc": 0.5662955886806682, "acc_stderr": 0.033957871501758975, "acc_norm": 0.5677771978873812, "acc_norm_stderr": 0.03465461275789696, "mc1": 0.32068543451652387, "mc1_stderr": 0.016339170373280906, "mc2": 0.46064912372709454, "mc2_stderr": 0.015111152108942209 }, "harness|arc:challenge|25": { "acc": 0.5802047781569966, "acc_stderr": 0.014422181226303026, "acc_norm": 0.6083617747440273, "acc_norm_stderr": 0.014264122124938213 }, "harness|hellaswag|10": { "acc": 0.5589523999203346, "acc_stderr": 0.004954977202585475, "acc_norm": 0.7461661023700458, "acc_norm_stderr": 0.00434314254509425 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.28, "acc_stderr": 0.045126085985421296, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421296 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.43703703703703706, "acc_stderr": 0.042849586397533994, "acc_norm": 0.43703703703703706, "acc_norm_stderr": 0.042849586397533994 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5460526315789473, "acc_stderr": 0.04051646342874142, "acc_norm": 0.5460526315789473, "acc_norm_stderr": 0.04051646342874142 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.55, "acc_stderr": 0.049999999999999996, "acc_norm": 0.55, "acc_norm_stderr": 0.049999999999999996 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.5773584905660377, "acc_stderr": 0.03040233144576954, "acc_norm": 0.5773584905660377, "acc_norm_stderr": 0.03040233144576954 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.625, "acc_stderr": 0.04048439222695598, "acc_norm": 0.625, "acc_norm_stderr": 0.04048439222695598 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.37, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.47, 
"acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5722543352601156, "acc_stderr": 0.037724468575180276, "acc_norm": 0.5722543352601156, "acc_norm_stderr": 0.037724468575180276 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3137254901960784, "acc_stderr": 0.04617034827006718, "acc_norm": 0.3137254901960784, "acc_norm_stderr": 0.04617034827006718 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5361702127659574, "acc_stderr": 0.03260038511835771, "acc_norm": 0.5361702127659574, "acc_norm_stderr": 0.03260038511835771 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.39473684210526316, "acc_stderr": 0.045981880578165414, "acc_norm": 0.39473684210526316, "acc_norm_stderr": 0.045981880578165414 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5517241379310345, "acc_stderr": 0.04144311810878152, "acc_norm": 0.5517241379310345, "acc_norm_stderr": 0.04144311810878152 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.40476190476190477, "acc_stderr": 0.025279850397404907, "acc_norm": 0.40476190476190477, "acc_norm_stderr": 0.025279850397404907 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3888888888888889, "acc_stderr": 0.04360314860077459, "acc_norm": 0.3888888888888889, "acc_norm_stderr": 0.04360314860077459 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.667741935483871, "acc_stderr": 0.026795560848122797, "acc_norm": 0.667741935483871, "acc_norm_stderr": 0.026795560848122797 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.47783251231527096, "acc_stderr": 0.035145285621750094, 
"acc_norm": 0.47783251231527096, "acc_norm_stderr": 0.035145285621750094 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.64, "acc_stderr": 0.048241815132442176, "acc_norm": 0.64, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6484848484848484, "acc_stderr": 0.037282069986826503, "acc_norm": 0.6484848484848484, "acc_norm_stderr": 0.037282069986826503 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7525252525252525, "acc_stderr": 0.030746300742124495, "acc_norm": 0.7525252525252525, "acc_norm_stderr": 0.030746300742124495 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7461139896373057, "acc_stderr": 0.0314102478056532, "acc_norm": 0.7461139896373057, "acc_norm_stderr": 0.0314102478056532 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5564102564102564, "acc_stderr": 0.0251891498947642, "acc_norm": 0.5564102564102564, "acc_norm_stderr": 0.0251891498947642 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3074074074074074, "acc_stderr": 0.028133252578815642, "acc_norm": 0.3074074074074074, "acc_norm_stderr":



