open-llm-leaderboard-old/details_NickyNicky__Mistral-7B-OpenOrca-oasst_top1_2023-08-25-v2
数据集概述
数据集简介
该数据集是在对模型 NickyNicky/Mistral-7B-OpenOrca-oasst_top1_2023-08-25-v2 进行评估运行时自动创建的,评估结果发布在 Open LLM Leaderboard 上。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 运行次数:数据集来自1次运行。每个运行结果作为一个特定的分割(split)存储,分割名称使用运行的时间戳。
- 训练分割:"train" 分割始终指向最新的结果。
- 结果配置:一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示在 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_NickyNicky__Mistral-7B-OpenOrca-oasst_top1_2023-08-25-v2",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是运行 2024-01-05T10:04:28.728094 的最新结果:
python { "all": { "acc": 0.6213400817324534, "acc_stderr": 0.03263383145717375, "acc_norm": 0.6264588975889877, "acc_norm_stderr": 0.03329437822786877, "mc1": 0.3084455324357405, "mc1_stderr": 0.01616803938315687, "mc2": 0.46376151030867085, "mc2_stderr": 0.01457773326521732 }, "harness|arc:challenge|25": { "acc": 0.5597269624573379, "acc_stderr": 0.014506769524804232, "acc_norm": 0.6049488054607508, "acc_norm_stderr": 0.014285898292938165 }, "harness|hellaswag|10": { "acc": 0.6201951802429795, "acc_stderr": 0.0048434625459435, "acc_norm": 0.8206532563234415, "acc_norm_stderr": 0.0038285834080213836 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.31, "acc_stderr": 0.046482319871173156, "acc_norm": 0.31, "acc_norm_stderr": 0.046482319871173156 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6074074074074074, "acc_stderr": 0.04218506215368879, "acc_norm": 0.6074074074074074, "acc_norm_stderr": 0.04218506215368879 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6710526315789473, "acc_stderr": 0.03823428969926605, "acc_norm": 0.6710526315789473, "acc_norm_stderr": 0.03823428969926605 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6641509433962264, "acc_stderr": 0.02906722014664483, "acc_norm": 0.6641509433962264, "acc_norm_stderr": 0.02906722014664483 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7361111111111112, "acc_stderr": 0.03685651095897532, "acc_norm": 0.7361111111111112, "acc_norm_stderr": 0.03685651095897532 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6127167630057804, "acc_stderr": 0.03714325906302065, "acc_norm": 0.6127167630057804, "acc_norm_stderr": 0.03714325906302065 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3137254901960784, "acc_stderr": 0.04617034827006717, "acc_norm": 0.3137254901960784, "acc_norm_stderr": 0.04617034827006717 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.04229525846816505, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816505 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.548936170212766, "acc_stderr": 0.032529096196131965, "acc_norm": 0.548936170212766, "acc_norm_stderr": 0.032529096196131965 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.41228070175438597, "acc_stderr": 0.046306532033665956, "acc_norm": 0.41228070175438597, "acc_norm_stderr": 0.046306532033665956 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5241379310344828, "acc_stderr": 0.0416180850350153, "acc_norm": 0.5241379310344828, "acc_norm_stderr": 0.0416180850350153 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4126984126984127, "acc_stderr": 0.025355741263055263, "acc_norm": 0.4126984126984127, "acc_norm_stderr": 0.025355741263055263 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3888888888888889, "acc_stderr": 0.04360314860077459, "acc_norm": 0.3888888888888889, "acc_norm_stderr": 0.04360314860077459 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.048241815132442176, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7483870967741936, "acc_stderr": 0.024685979286239976, "acc_norm": 0.7483870967741936, "acc_norm_stderr": 0.024685979286239976 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.4975369458128079, "acc_stderr": 0.03517945038691063, 
"acc_norm": 0.4975369458128079, "acc_norm_stderr": 0.03517945038691063 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7515151515151515, "acc_stderr": 0.03374402644139403, "acc_norm": 0.7515151515151515, "acc_norm_stderr": 0.03374402644139403 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.797979797979798, "acc_stderr": 0.02860620428922987, "acc_norm": 0.797979797979798, "acc_norm_stderr": 0.02860620428922987 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8704663212435233, "acc_stderr": 0.02423353229775873, "acc_norm": 0.8704663212435233, "acc_norm_stderr": 0.02423353229775873 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.617948717948718, "acc_stderr": 0.024635549163908237, "acc_norm": 0.617948717948718, "acc_norm_stderr": 0.024635549163908237 }, "harness|hendrycksTest



