open-llm-leaderboard-old/details_fblgit__una-xaberius-34b-v1beta
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上对模型 fblgit/una-xaberius-34b-v1beta 进行评估运行期间自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行都作为一个特定的分片存在于各个配置中,分片以该次运行的时间戳命名。
- "train" 分片始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_fblgit__una-xaberius-34b-v1beta", "harness_winogrande_5", split="train")
```
最新结果
这些是最新的结果,来自 2023-12-09T11:16:37.904970 的运行: python { "all": { "acc": 0.7767755521790687, "acc_stderr": 0.027620532165746333, "acc_norm": 0.7816657153803744, "acc_norm_stderr": 0.02813443103457644, "mc1": 0.4602203182374541, "mc1_stderr": 0.01744801722396088, "mc2": 0.6144919168362304, "mc2_stderr": 0.015159547860602553 }, "harness|arc:challenge|25": { "acc": 0.6791808873720137, "acc_stderr": 0.013640943091946524, "acc_norm": 0.7039249146757679, "acc_norm_stderr": 0.013340916085246254 }, "harness|hellaswag|10": { "acc": 0.6743676558454491, "acc_stderr": 0.004676529200753001, "acc_norm": 0.8676558454491137, "acc_norm_stderr": 0.0033817200071652002 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.7481481481481481, "acc_stderr": 0.03749850709174021, "acc_norm": 0.7481481481481481, "acc_norm_stderr": 0.03749850709174021 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.8947368421052632, "acc_stderr": 0.024974533450920693, "acc_norm": 0.8947368421052632, "acc_norm_stderr": 0.024974533450920693 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.81, "acc_stderr": 0.03942772444036623, "acc_norm": 0.81, "acc_norm_stderr": 0.03942772444036623 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.8113207547169812, "acc_stderr": 0.02407999513006224, "acc_norm": 0.8113207547169812, "acc_norm_stderr": 0.02407999513006224 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.8958333333333334, "acc_stderr": 0.025545239210256917, "acc_norm": 0.8958333333333334, "acc_norm_stderr": 0.025545239210256917 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.67, "acc_stderr": 0.047258156262526066, "acc_norm": 0.67, "acc_norm_stderr": 0.047258156262526066 
}, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.7514450867052023, "acc_stderr": 0.03295304696818318, "acc_norm": 0.7514450867052023, "acc_norm_stderr": 0.03295304696818318 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.5882352941176471, "acc_stderr": 0.04897104952726366, "acc_norm": 0.5882352941176471, "acc_norm_stderr": 0.04897104952726366 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.81, "acc_stderr": 0.03942772444036624, "acc_norm": 0.81, "acc_norm_stderr": 0.03942772444036624 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.7787234042553192, "acc_stderr": 0.02713634960242406, "acc_norm": 0.7787234042553192, "acc_norm_stderr": 0.02713634960242406 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.631578947368421, "acc_stderr": 0.04537815354939391, "acc_norm": 0.631578947368421, "acc_norm_stderr": 0.04537815354939391 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.7931034482758621, "acc_stderr": 0.03375672449560554, "acc_norm": 0.7931034482758621, "acc_norm_stderr": 0.03375672449560554 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.7619047619047619, "acc_stderr": 0.02193587808118476, "acc_norm": 0.7619047619047619, "acc_norm_stderr": 0.02193587808118476 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5396825396825397, "acc_stderr": 0.04458029125470973, "acc_norm": 0.5396825396825397, "acc_norm_stderr": 0.04458029125470973 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.54, "acc_stderr": 0.05009082659620333, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.9161290322580645, "acc_stderr": 0.015769027496775667, "acc_norm": 0.9161290322580645, "acc_norm_stderr": 0.015769027496775667 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.6551724137931034, "acc_stderr": 0.03344283744280458, "acc_norm": 0.6551724137931034, "acc_norm_stderr": 0.03344283744280458 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.87, "acc_stderr": 0.03379976689896309, "acc_norm": 0.87, "acc_norm_stderr": 0.03379976689896309 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8848484848484849, "acc_stderr": 0.024925699798115344, "acc_norm": 0.8848484848484849, "acc_norm_stderr": 0.024925699798115344 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.9292929292929293, "acc_stderr": 0.0182631054201995, "acc_norm": 0.9292929292929293, "acc_norm_stderr": 0.0182631054201995 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9792746113989638, "acc_stderr": 0.010281417011909036, "acc_norm": 0.9792746113989638, "acc_norm_stderr": 0.010281417011909036 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.841025641025641, "acc_stderr": 0.01853930114094035, "acc_norm": 0.841025641025641, "acc_norm_stderr": 0.01853930114094035 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.4666666666666667, "acc_stderr": 0.030417716961717474, "acc_norm": 0.4666666666666667, "acc_norm_stderr": 0.030417716961717474 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.886554



