open-llm-leaderboard-old/details_TeeZee__GALAXY-XB-v.01
收藏数据集概述
数据集简介
该数据集是在对模型 TeeZee/GALAXY-XB-v.01 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集结构
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集来源于 1 次运行,每次运行在每个配置中都有一个特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_TeeZee__GALAXY-XB-v.01", "harness_winogrande_5", split="train")
最新结果
以下是 2024-03-10T08:38:37.798892 运行的最新结果:
python { "all": { "acc": 0.6489294861573784, "acc_stderr": 0.031883763657466264, "acc_norm": 0.6533932757026093, "acc_norm_stderr": 0.03252706366243769, "mc1": 0.2766217870257038, "mc1_stderr": 0.015659605755326923, "mc2": 0.4367256901069689, "mc2_stderr": 0.014358645276062254 }, "harness|arc:challenge|25": { "acc": 0.5870307167235495, "acc_stderr": 0.014388344935398326, "acc_norm": 0.6092150170648464, "acc_norm_stderr": 0.01425856388051378 }, "harness|hellaswag|10": { "acc": 0.6401115315674168, "acc_stderr": 0.004789865379084518, "acc_norm": 0.8292172873929496, "acc_norm_stderr": 0.003755498941781852 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.41, "acc_stderr": 0.04943110704237103, "acc_norm": 0.41, "acc_norm_stderr": 0.04943110704237103 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6074074074074074, "acc_stderr": 0.04218506215368879, "acc_norm": 0.6074074074074074, "acc_norm_stderr": 0.04218506215368879 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.743421052631579, "acc_stderr": 0.0355418036802569, "acc_norm": 0.743421052631579, "acc_norm_stderr": 0.0355418036802569 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.68, "acc_stderr": 0.046882617226215034, "acc_norm": 0.68, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.690566037735849, "acc_stderr": 0.02845015479411864, "acc_norm": 0.690566037735849, "acc_norm_stderr": 0.02845015479411864 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7569444444444444, "acc_stderr": 0.03586879280080339, "acc_norm": 0.7569444444444444, "acc_norm_stderr": 0.03586879280080339 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.55, "acc_stderr": 0.05, "acc_norm": 0.55, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6647398843930635, "acc_stderr": 0.03599586301247078, "acc_norm": 0.6647398843930635, "acc_norm_stderr": 0.03599586301247078 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.048971049527263666, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.048971049527263666 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.04229525846816507, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816507 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6042553191489362, "acc_stderr": 0.031967586978353627, "acc_norm": 0.6042553191489362, "acc_norm_stderr": 0.031967586978353627 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4824561403508772, "acc_stderr": 0.04700708033551038, "acc_norm": 0.4824561403508772, "acc_norm_stderr": 0.04700708033551038 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6, "acc_stderr": 0.040824829046386284, "acc_norm": 0.6, "acc_norm_stderr": 0.040824829046386284 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4312169312169312, "acc_stderr": 0.025506481698138198, "acc_norm": 0.4312169312169312, "acc_norm_stderr": 0.025506481698138198 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4365079365079365, "acc_stderr": 0.04435932892851466, "acc_norm": 0.4365079365079365, "acc_norm_stderr": 0.04435932892851466 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.048241815132442176, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8225806451612904, "acc_stderr": 0.02173254068932928, "acc_norm": 0.8225806451612904, "acc_norm_stderr": 0.02173254068932928 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.45320197044334976, "acc_stderr": 0.03502544650845872, "acc_norm": 0.45320197044334976, "acc_norm_stderr": 0.03502544650845872 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.63, "acc_stderr": 0.04852365870939098, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939098 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7636363636363637, "acc_stderr": 0.03317505930009181, "acc_norm": 0.7636363636363637, "acc_norm_stderr": 0.03317505930009181 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8333333333333334, "acc_stderr": 0.026552207828215282, "acc_norm": 0.8333333333333334, "acc_norm_stderr": 0.026552207828215282 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.917098445595855, "acc_stderr": 0.01989934131572178, "acc_norm": 0.917098445595855, "acc_norm_stderr": 0.01989934131572178 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.676923076923077, "acc_stderr": 0.023710888501970565, "acc_norm": 0.676923076923077, "acc_norm_stderr": 0.023710888501970565 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.31851851851851853, "acc_stderr": 0.02840653309060846, "acc_norm": 0.31851851851851853, "acc_norm_stderr": 0.02840653309060846 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6890756302521



