open-llm-leaderboard-old/details_argilla__notux-8x7b-v1-epoch-2
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上对模型 argilla/notux-8x7b-v1-epoch-2 进行评估运行期间自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中都对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_argilla__notux-8x7b-v1-epoch-2", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-01-06T07:23:08.510905 运行的最新结果:
python { "all": { "acc": 0.7132510295468097, "acc_stderr": 0.030137639590982482, "acc_norm": 0.7169084121358973, "acc_norm_stderr": 0.030719998582647873, "mc1": 0.5140758873929009, "mc1_stderr": 0.01749656371704278, "mc2": 0.6596774083234566, "mc2_stderr": 0.015018146932027448 }, "harness|arc:challenge|25": { "acc": 0.6808873720136519, "acc_stderr": 0.013621696119173304, "acc_norm": 0.7064846416382252, "acc_norm_stderr": 0.01330725044494111 }, "harness|hellaswag|10": { "acc": 0.6900019916351324, "acc_stderr": 0.0046154722103160396, "acc_norm": 0.8780123481378211, "acc_norm_stderr": 0.0032660269509226414 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.674074074074074, "acc_stderr": 0.040491220417025055, "acc_norm": 0.674074074074074, "acc_norm_stderr": 0.040491220417025055 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7960526315789473, "acc_stderr": 0.03279000406310049, "acc_norm": 0.7960526315789473, "acc_norm_stderr": 0.03279000406310049 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.72, "acc_stderr": 0.04512608598542127, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542127 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7773584905660378, "acc_stderr": 0.025604233470899095, "acc_norm": 0.7773584905660378, "acc_norm_stderr": 0.025604233470899095 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.8402777777777778, "acc_stderr": 0.030635578972093278, "acc_norm": 0.8402777777777778, "acc_norm_stderr": 0.030635578972093278 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.62, "acc_stderr": 0.04878317312145632, "acc_norm": 0.62, "acc_norm_stderr": 0.04878317312145632 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.45, "acc_stderr": 0.05, "acc_norm": 0.45, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.7514450867052023, "acc_stderr": 0.03295304696818318, "acc_norm": 0.7514450867052023, "acc_norm_stderr": 0.03295304696818318 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.43137254901960786, "acc_stderr": 0.04928099597287534, "acc_norm": 0.43137254901960786, "acc_norm_stderr": 0.04928099597287534 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.82, "acc_stderr": 0.03861229196653695, "acc_norm": 0.82, "acc_norm_stderr": 0.03861229196653695 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6851063829787234, "acc_stderr": 0.03036358219723817, "acc_norm": 0.6851063829787234, "acc_norm_stderr": 0.03036358219723817 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5964912280701754, "acc_stderr": 0.04615186962583707, "acc_norm": 0.5964912280701754, "acc_norm_stderr": 0.04615186962583707 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6551724137931034, "acc_stderr": 0.03960933549451208, "acc_norm": 0.6551724137931034, "acc_norm_stderr": 0.03960933549451208 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.47354497354497355, "acc_stderr": 0.025715239811346758, "acc_norm": 0.47354497354497355, "acc_norm_stderr": 0.025715239811346758 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5158730158730159, "acc_stderr": 0.044698818540726076, "acc_norm": 0.5158730158730159, "acc_norm_stderr": 0.044698818540726076 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8516129032258064, "acc_stderr": 0.020222737554330378, "acc_norm": 0.8516129032258064, "acc_norm_stderr": 0.020222737554330378 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.6157635467980296, "acc_stderr": 
0.03422398565657551, "acc_norm": 0.6157635467980296, "acc_norm_stderr": 0.03422398565657551 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.8, "acc_stderr": 0.04020151261036846, "acc_norm": 0.8, "acc_norm_stderr": 0.04020151261036846 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8, "acc_stderr": 0.03123475237772117, "acc_norm": 0.8, "acc_norm_stderr": 0.03123475237772117 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8686868686868687, "acc_stderr": 0.024063156416822523, "acc_norm": 0.8686868686868687, "acc_norm_stderr": 0.024063156416822523 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9585492227979274, "acc_stderr": 0.01438543285747646, "acc_norm": 0.9585492227979274, "acc_norm_stderr": 0.01438543285747646 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.7025641025641025, "acc_stderr": 0.023177408131465946, "acc_norm": 0.7025641025641025, "acc_norm_stderr": 0.023177408131465946 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3814814814814815, "acc_stderr": 0.029616718927497582, "acc_norm": 0.3814814814814815, "acc_norm_stderr": 0.029616718927497582 }, "harness|hendrycksTest-high_school_microeconomics



