open-llm-leaderboard-old/details_GeneZC__MiniMA-3B
数据集概述
数据集名称
Evaluation run of GeneZC/MiniMA-3B
数据集来源
该数据集是在对模型 GeneZC/MiniMA-3B 进行评估时自动创建的,评估结果发布在 Open LLM Leaderboard 上。
数据集结构
- 数据集包含 64 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示在 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_GeneZC__MiniMA-3B_public", "harness_winogrande_5", split="train")
```
最新结果
以下是 2023-11-14T07:13:08.636402 运行的最新结果:
python { "all": { "acc": 0.29408510608333527, "acc_stderr": 0.0320620383409799, "acc_norm": 0.2960643309987877, "acc_norm_stderr": 0.03284434924194973, "mc1": 0.24357405140758873, "mc1_stderr": 0.015026354824910782, "mc2": 0.3975501723358467, "mc2_stderr": 0.013807724938123454, "em": 0.0016778523489932886, "em_stderr": 0.00041913301788268467, "f1": 0.047221057046979924, "f1_stderr": 0.0012063604325880553 }, "harness|arc:challenge|25": { "acc": 0.3993174061433447, "acc_stderr": 0.014312094557946704, "acc_norm": 0.43430034129692835, "acc_norm_stderr": 0.014484703048857362 }, "harness|hellaswag|10": { "acc": 0.49990041824337783, "acc_stderr": 0.004989781312483212, "acc_norm": 0.6806413065126469, "acc_norm_stderr": 0.004652753439460152 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.25925925925925924, "acc_stderr": 0.03785714465066655, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.03785714465066655 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.28289473684210525, "acc_stderr": 0.03665349695640767, "acc_norm": 0.28289473684210525, "acc_norm_stderr": 0.03665349695640767 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.24, "acc_stderr": 0.04292346959909283, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909283 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2943396226415094, "acc_stderr": 0.028049186315695245, "acc_norm": 0.2943396226415094, "acc_norm_stderr": 0.028049186315695245 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2847222222222222, "acc_stderr": 0.03773809990686935, "acc_norm": 0.2847222222222222, "acc_norm_stderr": 0.03773809990686935 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.26, "acc_stderr": 0.0440844002276808, "acc_norm": 0.26, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.4, 
"acc_stderr": 0.049236596391733084, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.2543352601156069, "acc_stderr": 0.0332055644308557, "acc_norm": 0.2543352601156069, "acc_norm_stderr": 0.0332055644308557 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.27450980392156865, "acc_stderr": 0.044405219061793254, "acc_norm": 0.27450980392156865, "acc_norm_stderr": 0.044405219061793254 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.33, "acc_stderr": 0.04725815626252605, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252605 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.225531914893617, "acc_stderr": 0.02732107841738753, "acc_norm": 0.225531914893617, "acc_norm_stderr": 0.02732107841738753 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2543859649122807, "acc_stderr": 0.040969851398436716, "acc_norm": 0.2543859649122807, "acc_norm_stderr": 0.040969851398436716 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4, "acc_stderr": 0.04082482904638628, "acc_norm": 0.4, "acc_norm_stderr": 0.04082482904638628 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.2566137566137566, "acc_stderr": 0.022494510767503154, "acc_norm": 0.2566137566137566, "acc_norm_stderr": 0.022494510767503154 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.18253968253968253, "acc_stderr": 0.03455071019102149, "acc_norm": 0.18253968253968253, "acc_norm_stderr": 0.03455071019102149 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.33, "acc_stderr": 0.04725815626252603, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252603 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.267741935483871, "acc_stderr": 0.025189006660212385, "acc_norm": 0.267741935483871, "acc_norm_stderr": 0.025189006660212385 }, 
"harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.19704433497536947, "acc_stderr": 0.02798672466673621, "acc_norm": 0.19704433497536947, "acc_norm_stderr": 0.02798672466673621 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.30303030303030304, "acc_stderr": 0.03588624800091709, "acc_norm": 0.30303030303030304, "acc_norm_stderr": 0.03588624800091709 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.2727272727272727, "acc_stderr": 0.03173071239071724, "acc_norm": 0.2727272727272727, "acc_norm_stderr": 0.03173071239071724 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.31088082901554404, "acc_stderr": 0.03340361906276586, "acc_norm": 0.31088082901554404, "acc_norm_stderr": 0.03340361906276586 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.31025641025641026, "acc_stderr": 0.02345467488940429, "acc_norm": 0.31025641025641026, "acc_norm_stderr": 0.02345467488940429 }, "harness|hendry



