open-llm-leaderboard-old/details_pansophic__new_model_test2
数据集概述
该数据集是在模型 pansophic/new_model_test2 的评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建;每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_pansophic__new_model_test2", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-03-01T11:44:05.916063 运行的最新结果:
python { "all": { "acc": 0.5638420021216752, "acc_stderr": 0.03378324751322572, "acc_norm": 0.5645972727751363, "acc_norm_stderr": 0.034480506272325, "mc1": 0.3243574051407589, "mc1_stderr": 0.01638797677964794, "mc2": 0.4654289715654586, "mc2_stderr": 0.015388256943129463 }, "harness|arc:challenge|25": { "acc": 0.5836177474402731, "acc_stderr": 0.01440561827943618, "acc_norm": 0.6203071672354948, "acc_norm_stderr": 0.014182119866974867 }, "harness|hellaswag|10": { "acc": 0.570902210714997, "acc_stderr": 0.00493935814556132, "acc_norm": 0.7536347341167098, "acc_norm_stderr": 0.0043001312233406945 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.28, "acc_stderr": 0.045126085985421296, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421296 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.45185185185185184, "acc_stderr": 0.04299268905480864, "acc_norm": 0.45185185185185184, "acc_norm_stderr": 0.04299268905480864 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5592105263157895, "acc_stderr": 0.04040311062490436, "acc_norm": 0.5592105263157895, "acc_norm_stderr": 0.04040311062490436 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.59, "acc_stderr": 0.049431107042371025, "acc_norm": 0.59, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.569811320754717, "acc_stderr": 0.030471445867183238, "acc_norm": 0.569811320754717, "acc_norm_stderr": 0.030471445867183238 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6458333333333334, "acc_stderr": 0.039994111357535424, "acc_norm": 0.6458333333333334, "acc_norm_stderr": 0.039994111357535424 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.36, "acc_stderr": 0.048241815132442176, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5664739884393064, "acc_stderr": 0.037786210790920566, "acc_norm": 0.5664739884393064, "acc_norm_stderr": 0.037786210790920566 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.29411764705882354, "acc_stderr": 0.04533838195929775, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.04533838195929775 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.74, "acc_stderr": 0.04408440022768077, "acc_norm": 0.74, "acc_norm_stderr": 0.04408440022768077 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.4765957446808511, "acc_stderr": 0.03265019475033582, "acc_norm": 0.4765957446808511, "acc_norm_stderr": 0.03265019475033582 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.3333333333333333, "acc_stderr": 0.044346007015849245, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.044346007015849245 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5241379310344828, "acc_stderr": 0.0416180850350153, "acc_norm": 0.5241379310344828, "acc_norm_stderr": 0.0416180850350153 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41005291005291006, "acc_stderr": 0.025331202438944427, "acc_norm": 0.41005291005291006, "acc_norm_stderr": 0.025331202438944427 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.42063492063492064, "acc_stderr": 0.04415438226743744, "acc_norm": 0.42063492063492064, "acc_norm_stderr": 0.04415438226743744 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6451612903225806, "acc_stderr": 0.027218889773308753, "acc_norm": 0.6451612903225806, "acc_norm_stderr": 0.027218889773308753 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.42857142857142855, "acc_stderr": 0.034819048444388045, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.034819048444388045 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.62, "acc_stderr": 0.04878317312145632, "acc_norm": 0.62, "acc_norm_stderr": 0.04878317312145632 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6424242424242425, "acc_stderr": 0.03742597043806585, "acc_norm": 0.6424242424242425, "acc_norm_stderr": 0.03742597043806585 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7323232323232324, "acc_stderr": 0.03154449888270285, "acc_norm": 0.7323232323232324, "acc_norm_stderr": 0.03154449888270285 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7616580310880829, "acc_stderr": 0.030748905363909892, "acc_norm": 0.7616580310880829, "acc_norm_stderr": 0.030748905363909892 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5384615384615384, "acc_stderr": 0.025275892070240644, "acc_norm": 0.5384615384615384, "acc_norm_stderr": 0.025275892070240644 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3148148148148148, "acc_stderr": 0.028317533496066475, "acc_norm": 0.3148148148148148, "acc_norm_stderr": 0.0



