open-llm-leaderboard-old/details_JaeyeonKang__CCK_Gony_v0.2
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 JaeyeonKang/CCK_Gony_v0.2 时自动创建的。
数据集组成
- 该数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_JaeyeonKang__CCK_Gony_v0.2", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-02-02T07:55:59.515019 运行的最新结果:
python { "all": { "acc": 0.7010040167512416, "acc_stderr": 0.03046871961713868, "acc_norm": 0.705796779111703, "acc_norm_stderr": 0.03105821273167017, "mc1": 0.4479804161566707, "mc1_stderr": 0.017408513063422906, "mc2": 0.5946296550939176, "mc2_stderr": 0.01480851612780029 }, "harness|arc:challenge|25": { "acc": 0.6493174061433447, "acc_stderr": 0.013944635930726099, "acc_norm": 0.6885665529010239, "acc_norm_stderr": 0.013532472099850939 }, "harness|hellaswag|10": { "acc": 0.6690898227444733, "acc_stderr": 0.00469579134050288, "acc_norm": 0.8660625373431587, "acc_norm_stderr": 0.0033988905252296995 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.04725815626252606, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252606 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6888888888888889, "acc_stderr": 0.03999262876617721, "acc_norm": 0.6888888888888889, "acc_norm_stderr": 0.03999262876617721 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.756578947368421, "acc_stderr": 0.034923496688842384, "acc_norm": 0.756578947368421, "acc_norm_stderr": 0.034923496688842384 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.71, "acc_stderr": 0.045604802157206845, "acc_norm": 0.71, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7622641509433963, "acc_stderr": 0.02619980880756193, "acc_norm": 0.7622641509433963, "acc_norm_stderr": 0.02619980880756193 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.8055555555555556, "acc_stderr": 0.03309615177059006, "acc_norm": 0.8055555555555556, "acc_norm_stderr": 0.03309615177059006 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.67, "acc_stderr": 0.047258156262526066, "acc_norm": 0.67, "acc_norm_stderr": 0.047258156262526066 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.7225433526011561, "acc_stderr": 0.03414014007044036, "acc_norm": 0.7225433526011561, "acc_norm_stderr": 0.03414014007044036 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.45098039215686275, "acc_stderr": 0.049512182523962625, "acc_norm": 0.45098039215686275, "acc_norm_stderr": 0.049512182523962625 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.8, "acc_stderr": 0.04020151261036846, "acc_norm": 0.8, "acc_norm_stderr": 0.04020151261036846 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6127659574468085, "acc_stderr": 0.03184389265339526, "acc_norm": 0.6127659574468085, "acc_norm_stderr": 0.03184389265339526 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5614035087719298, "acc_stderr": 0.04668000738510455, "acc_norm": 0.5614035087719298, "acc_norm_stderr": 0.04668000738510455 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6413793103448275, "acc_stderr": 0.039966295748767186, "acc_norm": 0.6413793103448275, "acc_norm_stderr": 0.039966295748767186 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.49206349206349204, "acc_stderr": 0.025748065871673286, "acc_norm": 0.49206349206349204, "acc_norm_stderr": 0.025748065871673286 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.48412698412698413, "acc_stderr": 0.04469881854072606, "acc_norm": 0.48412698412698413, "acc_norm_stderr": 0.04469881854072606 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8516129032258064, "acc_stderr": 0.020222737554330378, "acc_norm": 0.8516129032258064, "acc_norm_stderr": 0.020222737554330378 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5665024630541872, "acc_stderr": 
0.034867317274198714, "acc_norm": 0.5665024630541872, "acc_norm_stderr": 0.034867317274198714 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.76, "acc_stderr": 0.04292346959909281, "acc_norm": 0.76, "acc_norm_stderr": 0.04292346959909281 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7818181818181819, "acc_stderr": 0.03225078108306289, "acc_norm": 0.7818181818181819, "acc_norm_stderr": 0.03225078108306289 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8787878787878788, "acc_stderr": 0.023253157951942088, "acc_norm": 0.8787878787878788, "acc_norm_stderr": 0.023253157951942088 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9585492227979274, "acc_stderr": 0.01438543285747646, "acc_norm": 0.9585492227979274, "acc_norm_stderr": 0.01438543285747646 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.7153846153846154, "acc_stderr": 0.0228783227997063, "acc_norm": 0.7153846153846154, "acc_norm_stderr": 0.0228783227997063 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.4, "acc_stderr": 0.0298696050953169, "acc_norm": 0.4, "acc_norm_stderr": 0.0298696050953169 }, "



