open-llm-leaderboard/details_PulsarAI__CollectiveCognition-v1.1-Nebula-7B
收藏数据集概述
数据集简介
该数据集是在对模型PulsarAI/CollectiveCognition-v1.1-Nebula-7B进行评估运行时自动创建的,评估运行在Open LLM Leaderboard上进行。
数据集组成
- 数据集包含64个配置,每个配置对应一个评估任务。
- 数据集从1次运行中创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train"分割始终指向最新的结果。
- 一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_PulsarAI__CollectiveCognition-v1.1-Nebula-7B_public", "harness_winogrande_5", split="train")
最新结果
以下是2023-11-12T21:42:17.063541运行的最新结果:
python { "all": { "acc": 0.5655902624582015, "acc_stderr": 0.033540567370804734, "acc_norm": 0.5747445580416879, "acc_norm_stderr": 0.03431067576831402, "mc1": 0.38555691554467564, "mc1_stderr": 0.01703883901059167, "mc2": 0.5353024010333743, "mc2_stderr": 0.015743888224866397, "em": 0.35675335570469796, "em_stderr": 0.004905829488253491, "f1": 0.4216977768456382, "f1_stderr": 0.0047367493845716785 }, "harness|arc:challenge|25": { "acc": 0.5324232081911263, "acc_stderr": 0.014580637569995421, "acc_norm": 0.5810580204778157, "acc_norm_stderr": 0.014418106953639013 }, "harness|hellaswag|10": { "acc": 0.6309500099581756, "acc_stderr": 0.004815613144385404, "acc_norm": 0.8239394542919737, "acc_norm_stderr": 0.0038009327705977565 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5555555555555556, "acc_stderr": 0.04292596718256981, "acc_norm": 0.5555555555555556, "acc_norm_stderr": 0.04292596718256981 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5986842105263158, "acc_stderr": 0.03988903703336284, "acc_norm": 0.5986842105263158, "acc_norm_stderr": 0.03988903703336284 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.43, "acc_stderr": 0.04975698519562428, "acc_norm": 0.43, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6188679245283019, "acc_stderr": 0.029890609686286623, "acc_norm": 0.6188679245283019, "acc_norm_stderr": 0.029890609686286623 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6319444444444444, "acc_stderr": 0.040329990539607175, "acc_norm": 0.6319444444444444, "acc_norm_stderr": 0.040329990539607175 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.38, "acc_stderr": 0.04878317312145632, "acc_norm": 0.38, "acc_norm_stderr": 0.04878317312145632 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.45, "acc_stderr": 0.049999999999999996, "acc_norm": 0.45, "acc_norm_stderr": 0.049999999999999996 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5433526011560693, "acc_stderr": 0.03798106566014498, "acc_norm": 0.5433526011560693, "acc_norm_stderr": 0.03798106566014498 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.30392156862745096, "acc_stderr": 0.04576665403207763, "acc_norm": 0.30392156862745096, "acc_norm_stderr": 0.04576665403207763 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.68, "acc_stderr": 0.04688261722621505, "acc_norm": 0.68, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.49361702127659574, "acc_stderr": 0.03268335899936337, "acc_norm": 0.49361702127659574, "acc_norm_stderr": 0.03268335899936337 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4473684210526316, "acc_stderr": 0.04677473004491199, "acc_norm": 0.4473684210526316, "acc_norm_stderr": 0.04677473004491199 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5448275862068965, "acc_stderr": 0.04149886942192117, "acc_norm": 0.5448275862068965, "acc_norm_stderr": 0.04149886942192117 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.3915343915343915, "acc_stderr": 0.02513809138885108, "acc_norm": 0.3915343915343915, "acc_norm_stderr": 0.02513809138885108 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.38095238095238093, "acc_stderr": 0.04343525428949098, "acc_norm": 0.38095238095238093, "acc_norm_stderr": 0.04343525428949098 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6483870967741936, "acc_stderr": 0.027162537826948458, "acc_norm": 0.6483870967741936, "acc_norm_stderr": 0.027162537826948458 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.45320197044334976, "acc_stderr": 0.03502544650845872, "acc_norm": 0.45320197044334976, "acc_norm_stderr": 0.03502544650845872 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.57, "acc_stderr": 0.04975698519562428, "acc_norm": 0.57, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7333333333333333, "acc_stderr": 0.03453131801885417, "acc_norm": 0.7333333333333333, "acc_norm_stderr": 0.03453131801885417 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7676767676767676, "acc_stderr": 0.030088629490217487, "acc_norm": 0.7676767676767676, "acc_norm_stderr": 0.030088629490217487 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8238341968911918, "acc_stderr": 0.02749350424454806, "acc_norm": 0.8238341968911918, "acc_norm_stderr": 0.02749350424454806 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5615384615384615, "acc_stderr": 0.025158266016868592, "acc_norm": 0.56153846153



