open-llm-leaderboard-old/details_Steelskull__VerA-Etheria-55b
数据集概述
数据集摘要
该数据集是在模型 Steelskull/VerA-Etheria-55b 在 Open LLM Leaderboard 上的评估运行期间自动创建的。
数据集组成
数据集由 63 个配置组成,每个配置对应一个评估任务。数据集创建自 1 次运行;每次运行在各个配置中都对应一个特定的分割(split),分割以该次运行的时间戳命名。"train" 分割始终指向最新的结果。
额外配置
一个额外的配置 "results" 存储了所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_Steelskull__VerA-Etheria-55b", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-01-25T17:11:24.913488 运行的最新结果:
python { "all": { "acc": 0.7263827073537332, "acc_stderr": 0.029170013986474255, "acc_norm": 0.7348687053269002, "acc_norm_stderr": 0.029706986665856413, "mc1": 0.379436964504284, "mc1_stderr": 0.016987039266142995, "mc2": 0.5210415817923857, "mc2_stderr": 0.01617919766526897 }, "harness|arc:challenge|25": { "acc": 0.6083617747440273, "acc_stderr": 0.014264122124938218, "acc_norm": 0.6424914675767918, "acc_norm_stderr": 0.014005494275916573 }, "harness|hellaswag|10": { "acc": 0.6434973112925712, "acc_stderr": 0.004779872250633708, "acc_norm": 0.8145787691694881, "acc_norm_stderr": 0.0038784463615532884 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.39, "acc_stderr": 0.04902071300001974, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.674074074074074, "acc_stderr": 0.040491220417025055, "acc_norm": 0.674074074074074, "acc_norm_stderr": 0.040491220417025055 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.8289473684210527, "acc_stderr": 0.0306436070716771, "acc_norm": 0.8289473684210527, "acc_norm_stderr": 0.0306436070716771 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.77, "acc_stderr": 0.04229525846816506, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816506 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7849056603773585, "acc_stderr": 0.02528839450289137, "acc_norm": 0.7849056603773585, "acc_norm_stderr": 0.02528839450289137 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.8680555555555556, "acc_stderr": 0.02830096838204443, "acc_norm": 0.8680555555555556, "acc_norm_stderr": 0.02830096838204443 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956911 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.58, "acc_stderr": 0.04960449637488584, "acc_norm": 0.58, "acc_norm_stderr": 0.04960449637488584 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6994219653179191, "acc_stderr": 0.0349610148119118, "acc_norm": 0.6994219653179191, "acc_norm_stderr": 0.0349610148119118 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.45098039215686275, "acc_stderr": 0.049512182523962625, "acc_norm": 0.45098039215686275, "acc_norm_stderr": 0.049512182523962625 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.81, "acc_stderr": 0.039427724440366234, "acc_norm": 0.81, "acc_norm_stderr": 0.039427724440366234 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.7531914893617021, "acc_stderr": 0.02818544130123409, "acc_norm": 0.7531914893617021, "acc_norm_stderr": 0.02818544130123409 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.543859649122807, "acc_stderr": 0.046854730419077895, "acc_norm": 0.543859649122807, "acc_norm_stderr": 0.046854730419077895 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.7448275862068966, "acc_stderr": 0.03632984052707842, "acc_norm": 0.7448275862068966, "acc_norm_stderr": 0.03632984052707842 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.5529100529100529, "acc_stderr": 0.025606723995777025, "acc_norm": 0.5529100529100529, "acc_norm_stderr": 0.025606723995777025 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5158730158730159, "acc_stderr": 0.044698818540726076, "acc_norm": 0.5158730158730159, "acc_norm_stderr": 0.044698818540726076 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.9032258064516129, "acc_stderr": 0.016818943416345197, "acc_norm": 0.9032258064516129, "acc_norm_stderr": 0.016818943416345197 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.6157635467980296, "acc_stderr": 
0.034223985656575494, "acc_norm": 0.6157635467980296, "acc_norm_stderr": 0.034223985656575494 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.8, "acc_stderr": 0.04020151261036846, "acc_norm": 0.8, "acc_norm_stderr": 0.04020151261036846 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8363636363636363, "acc_stderr": 0.028887872395487946, "acc_norm": 0.8363636363636363, "acc_norm_stderr": 0.028887872395487946 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.9292929292929293, "acc_stderr": 0.018263105420199505, "acc_norm": 0.9292929292929293, "acc_norm_stderr": 0.018263105420199505 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9740932642487047, "acc_stderr": 0.01146452335695318, "acc_norm": 0.9740932642487047, "acc_norm_stderr": 0.01146452335695318 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.7743589743589744, "acc_stderr": 0.021193632525148522, "acc_norm": 0.7743589743589744, "acc_norm_stderr": 0.021193632525148522 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.37037037037037035, "acc_stderr": 0.02944316932303154, "acc_norm": 0.37037037037037035, "acc_norm_stderr



