open-llm-leaderboard-old/details_alignment-handbook__zephyr-7b-dpo-full
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上对模型 alignment-handbook/zephyr-7b-dpo-full 进行评估运行期间自动创建的。
数据集组成
- 数据集由63个配置组成,每个配置对应一个评估任务。
- 数据集由1次运行创建。每次运行在各个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train"分割始终指向最新的结果。
- 一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
最新结果
以下是2024-04-08T01:29:43.310904运行的最新结果:
python { "all": { "acc": 0.5926699986918813, "acc_stderr": 0.03321334145058982, "acc_norm": 0.6004002100600775, "acc_norm_stderr": 0.03393325403898488, "mc1": 0.33659730722154224, "mc1_stderr": 0.016542412809494877, "mc2": 0.4740788248392144, "mc2_stderr": 0.01579474521827581 }, "harness|arc:challenge|25": { "acc": 0.5972696245733788, "acc_stderr": 0.01433223630679015, "acc_norm": 0.628839590443686, "acc_norm_stderr": 0.014117971901142824 }, "harness|hellaswag|10": { "acc": 0.6531567416849233, "acc_stderr": 0.004749926091672248, "acc_norm": 0.8444532961561442, "acc_norm_stderr": 0.0036168436913607653 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.32, "acc_stderr": 0.04688261722621503, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621503 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5407407407407407, "acc_stderr": 0.04304979692464242, "acc_norm": 0.5407407407407407, "acc_norm_stderr": 0.04304979692464242 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.625, "acc_stderr": 0.039397364351956274, "acc_norm": 0.625, "acc_norm_stderr": 0.039397364351956274 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.55, "acc_stderr": 0.05, "acc_norm": 0.55, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6452830188679245, "acc_stderr": 0.02944517532819959, "acc_norm": 0.6452830188679245, "acc_norm_stderr": 0.02944517532819959 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7013888888888888, "acc_stderr": 0.03827052357950756, "acc_norm": 0.7013888888888888, "acc_norm_stderr": 0.03827052357950756 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.33, "acc_stderr": 0.04725815626252604, 
"acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252604 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.653179190751445, "acc_stderr": 0.036291466701596636, "acc_norm": 0.653179190751445, "acc_norm_stderr": 0.036291466701596636 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3627450980392157, "acc_stderr": 0.04784060704105653, "acc_norm": 0.3627450980392157, "acc_norm_stderr": 0.04784060704105653 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.74, "acc_stderr": 0.0440844002276808, "acc_norm": 0.74, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5063829787234042, "acc_stderr": 0.032683358999363366, "acc_norm": 0.5063829787234042, "acc_norm_stderr": 0.032683358999363366 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.38596491228070173, "acc_stderr": 0.045796394220704334, "acc_norm": 0.38596491228070173, "acc_norm_stderr": 0.045796394220704334 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.496551724137931, "acc_stderr": 0.041665675771015785, "acc_norm": 0.496551724137931, "acc_norm_stderr": 0.041665675771015785 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.3835978835978836, "acc_stderr": 0.025043757318520196, "acc_norm": 0.3835978835978836, "acc_norm_stderr": 0.025043757318520196 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3968253968253968, "acc_stderr": 0.04375888492727061, "acc_norm": 0.3968253968253968, "acc_norm_stderr": 0.04375888492727061 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.41, "acc_stderr": 0.04943110704237102, "acc_norm": 0.41, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6903225806451613, "acc_stderr": 0.026302774983517414, "acc_norm": 0.6903225806451613, "acc_norm_stderr": 0.026302774983517414 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5172413793103449, "acc_stderr": 0.03515895551165698, "acc_norm": 0.5172413793103449, 
"acc_norm_stderr": 0.03515895551165698 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.64, "acc_stderr": 0.04824181513244218, "acc_norm": 0.64, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7696969696969697, "acc_stderr": 0.032876667586034906, "acc_norm": 0.7696969696969697, "acc_norm_stderr": 0.032876667586034906 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7626262626262627, "acc_stderr": 0.0303137105381989, "acc_norm": 0.7626262626262627, "acc_norm_stderr": 0.0303137105381989 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8082901554404145, "acc_stderr": 0.02840895362624528, "acc_norm": 0.8082901554404145, "acc_norm_stderr": 0.02840895362624528 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.541025641025641, "acc_stderr": 0.025265525491284295, "acc_norm": 0.541025641025641, "acc_norm_stderr": 0.025265525491284295 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3037037037037037, "acc_stderr": 0.028037929969114993, "acc_norm": 0.3037037037037037, "acc_norm_stderr": 0.028037929969114993 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6050420168067226, "acc_stderr": 0.031753678460966245, "acc_norm": 0.6050420168067226, "acc_norm_stderr":



