open-llm-leaderboard-old/details_NeuralNovel__Aeryth-7B-v0.1
收藏数据集概述
数据集简介
该数据集是在对模型NeuralNovel/Aeryth-7B-v0.1进行评估运行期间自动创建的,用于Open LLM Leaderboard。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 运行次数:数据集由4次运行创建,每次运行在每个配置中作为一个特定的分割存在,分割名称使用运行的时间戳。
- 训练分割:"train"分割始终指向最新的结果。
- 结果配置:一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_NeuralNovel__Aeryth-7B-v0.1", "harness_winogrande_5", split="train")
最新结果
以下是2024-01-14T12:31:11.639995运行的最新结果:
python { "all": { "acc": 0.607832340017972, "acc_stderr": 0.033171072669556316, "acc_norm": 0.6134606437151463, "acc_norm_stderr": 0.03384290514267795, "mc1": 0.4602203182374541, "mc1_stderr": 0.01744801722396088, "mc2": 0.6357466374094296, "mc2_stderr": 0.015661867399479723 }, "harness|arc:challenge|25": { "acc": 0.5631399317406144, "acc_stderr": 0.014494421584256524, "acc_norm": 0.6032423208191127, "acc_norm_stderr": 0.014296513020180646 }, "harness|hellaswag|10": { "acc": 0.6514638518223461, "acc_stderr": 0.004755329243976671, "acc_norm": 0.835291774546903, "acc_norm_stderr": 0.0037015895712743134 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.04605661864718381, "acc_norm": 0.3, "acc_norm_stderr": 0.04605661864718381 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5851851851851851, "acc_stderr": 0.04256193767901408, "acc_norm": 0.5851851851851851, "acc_norm_stderr": 0.04256193767901408 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.631578947368421, "acc_stderr": 0.03925523381052932, "acc_norm": 0.631578947368421, "acc_norm_stderr": 0.03925523381052932 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6792452830188679, "acc_stderr": 0.028727502957880267, "acc_norm": 0.6792452830188679, "acc_norm_stderr": 0.028727502957880267 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6666666666666666, "acc_stderr": 0.03942082639927213, "acc_norm": 0.6666666666666666, "acc_norm_stderr": 0.03942082639927213 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.51, "acc_stderr": 0.05024183937956911, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956911 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.4, "acc_stderr": 0.049236596391733084, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5664739884393064, "acc_stderr": 0.03778621079092056, "acc_norm": 0.5664739884393064, "acc_norm_stderr": 0.03778621079092056 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4215686274509804, "acc_stderr": 0.04913595201274498, "acc_norm": 0.4215686274509804, "acc_norm_stderr": 0.04913595201274498 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.72, "acc_stderr": 0.04512608598542128, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5361702127659574, "acc_stderr": 0.032600385118357715, "acc_norm": 0.5361702127659574, "acc_norm_stderr": 0.032600385118357715 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.43859649122807015, "acc_stderr": 0.04668000738510455, "acc_norm": 0.43859649122807015, "acc_norm_stderr": 0.04668000738510455 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6206896551724138, "acc_stderr": 0.04043461861916747, "acc_norm": 0.6206896551724138, "acc_norm_stderr": 0.04043461861916747 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.36772486772486773, "acc_stderr": 0.024833839825562417, "acc_norm": 0.36772486772486773, "acc_norm_stderr": 0.024833839825562417 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3968253968253968, "acc_stderr": 0.043758884927270605, "acc_norm": 0.3968253968253968, "acc_norm_stderr": 0.043758884927270605 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6903225806451613, "acc_stderr": 0.026302774983517414, "acc_norm": 0.6903225806451613, "acc_norm_stderr": 0.026302774983517414 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5073891625615764, "acc_stderr": 0.035176035403610105, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.64, "acc_stderr": 0.048241815132442176, "acc_norm": 0.64, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7212121212121212, "acc_stderr": 0.03501438706296781, "acc_norm": 0.7212121212121212, "acc_norm_stderr": 0.03501438706296781 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7525252525252525, "acc_stderr": 0.030746300742124488, "acc_norm": 0.7525252525252525, "acc_norm_stderr": 0.030746300742124488 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.844559585492228, "acc_stderr": 0.026148483469153303, "acc_norm": 0.844559585492228, "acc_norm_stderr": 0.026148483469153303 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5666666666666667, "acc_stderr": 0.025124653525885117, "acc_norm": 0.5666666666666667, "acc_norm_stderr": 0.025124653525885117 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.34074074074074073, "acc_stderr": 0.028897748741131143, "acc_norm": 0.340740740



