open-llm-leaderboard-old/details_luffycodes__vicuna-class-shishya-ac-hal-7b-ep3
数据集概述
数据集简介
该数据集是在评估模型 luffycodes/vicuna-class-shishya-ac-hal-7b-ep3 在 Open LLM Leaderboard 上的运行过程中自动创建的。
数据集组成
- 该数据集包含 63 个配置,每个配置对应一个评估任务。
- 该数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分片,分片以该次运行的时间戳命名。
- "train" 分片始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset
data = load_dataset("open-llm-leaderboard/details_luffycodes__vicuna-class-shishya-ac-hal-7b-ep3", "harness_winogrande_5", split="train")
```
最新结果
以下是 2023-12-16T15:05:04.315196 运行的最新结果:
python { "all": { "acc": 0.504457394086061, "acc_stderr": 0.0339872992064852, "acc_norm": 0.5129744502613426, "acc_norm_stderr": 0.03491403416694853, "mc1": 0.2766217870257038, "mc1_stderr": 0.015659605755326923, "mc2": 0.43032912918561517, "mc2_stderr": 0.014997775568928156 }, "harness|arc:challenge|25": { "acc": 0.4249146757679181, "acc_stderr": 0.014445698968520769, "acc_norm": 0.4462457337883959, "acc_norm_stderr": 0.014526705548539982 }, "harness|hellaswag|10": { "acc": 0.5800637323242382, "acc_stderr": 0.004925394995490124, "acc_norm": 0.7697669786895041, "acc_norm_stderr": 0.004201215520808244 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.27, "acc_stderr": 0.044619604333847415, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847415 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5185185185185185, "acc_stderr": 0.043163785995113245, "acc_norm": 0.5185185185185185, "acc_norm_stderr": 0.043163785995113245 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.4868421052631579, "acc_stderr": 0.04067533136309173, "acc_norm": 0.4868421052631579, "acc_norm_stderr": 0.04067533136309173 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.569811320754717, "acc_stderr": 0.030471445867183238, "acc_norm": 0.569811320754717, "acc_norm_stderr": 0.030471445867183238 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5, "acc_stderr": 0.04181210050035455, "acc_norm": 0.5, "acc_norm_stderr": 0.04181210050035455 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.31, "acc_stderr": 0.04648231987117317, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117317 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.47, "acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 
0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.45664739884393063, "acc_stderr": 0.03798106566014498, "acc_norm": 0.45664739884393063, "acc_norm_stderr": 0.03798106566014498 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.18627450980392157, "acc_stderr": 0.03873958714149352, "acc_norm": 0.18627450980392157, "acc_norm_stderr": 0.03873958714149352 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.65, "acc_stderr": 0.047937248544110196, "acc_norm": 0.65, "acc_norm_stderr": 0.047937248544110196 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.425531914893617, "acc_stderr": 0.03232146916224468, "acc_norm": 0.425531914893617, "acc_norm_stderr": 0.03232146916224468 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.34210526315789475, "acc_stderr": 0.04462917535336936, "acc_norm": 0.34210526315789475, "acc_norm_stderr": 0.04462917535336936 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4896551724137931, "acc_stderr": 0.04165774775728763, "acc_norm": 0.4896551724137931, "acc_norm_stderr": 0.04165774775728763 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.335978835978836, "acc_stderr": 0.02432631052914915, "acc_norm": 0.335978835978836, "acc_norm_stderr": 0.02432631052914915 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.30952380952380953, "acc_stderr": 0.04134913018303316, "acc_norm": 0.30952380952380953, "acc_norm_stderr": 0.04134913018303316 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.5612903225806452, "acc_stderr": 0.02822949732031722, "acc_norm": 0.5612903225806452, "acc_norm_stderr": 0.02822949732031722 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.41379310344827586, "acc_stderr": 0.03465304488406795, "acc_norm": 
0.41379310344827586, "acc_norm_stderr": 0.03465304488406795 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6424242424242425, "acc_stderr": 0.037425970438065864, "acc_norm": 0.6424242424242425, "acc_norm_stderr": 0.037425970438065864 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.6111111111111112, "acc_stderr": 0.0347327959083696, "acc_norm": 0.6111111111111112, "acc_norm_stderr": 0.0347327959083696 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7564766839378239, "acc_stderr": 0.030975436386845457, "acc_norm": 0.7564766839378239, "acc_norm_stderr": 0.030975436386845457 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.4846153846153846, "acc_stderr": 0.025339003010106515, "acc_norm": 0.4846153846153846, "acc_norm_stderr": 0.025339003010106515 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.27037037037037037, "acc_stderr": 0.02708037281514566, "acc_norm": 0.27037037037037037,



