open-llm-leaderboard-old/details_abideen__PhigRange-2.7B-slerp
收藏数据集概述
数据集简介
该数据集是在评估模型abideen/PhigRange-2.7B-slerp在Open LLM Leaderboard上的自动创建的。数据集包含63个配置,每个配置对应一个评估任务。
数据集结构
数据集由1次运行创建,每个运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。"train"分割总是指向最新的结果。
结果配置
一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_abideen__PhigRange-2.7B-slerp", "harness_winogrande_5", split="train")
最新结果
以下是最新结果的摘要: python { "all": { "acc": 0.5767941428995237, "acc_stderr": 0.03387553112768988, "acc_norm": 0.5768839451218056, "acc_norm_stderr": 0.034576125147848885, "mc1": 0.3769889840881273, "mc1_stderr": 0.016965517578930354, "mc2": 0.5316928803057882, "mc2_stderr": 0.015529301409234428 }, "harness|arc:challenge|25": { "acc": 0.5921501706484642, "acc_stderr": 0.014361097288449703, "acc_norm": 0.6168941979522184, "acc_norm_stderr": 0.014206472661672874 }, "harness|hellaswag|10": { "acc": 0.5813582951603267, "acc_stderr": 0.004923281841828515, "acc_norm": 0.7663811989643498, "acc_norm_stderr": 0.004222676709104567 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4740740740740741, "acc_stderr": 0.04313531696750574, "acc_norm": 0.4740740740740741, "acc_norm_stderr": 0.04313531696750574 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5855263157894737, "acc_stderr": 0.04008973785779206, "acc_norm": 0.5855263157894737, "acc_norm_stderr": 0.04008973785779206 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.59, "acc_stderr": 0.04943110704237102, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.5924528301886792, "acc_stderr": 0.030242233800854494, "acc_norm": 0.5924528301886792, "acc_norm_stderr": 0.030242233800854494 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6180555555555556, "acc_stderr": 0.040629907841466674, "acc_norm": 0.6180555555555556, "acc_norm_stderr": 0.040629907841466674 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.4, "acc_stderr": 0.049236596391733084, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.45, "acc_stderr": 0.04999999999999999, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.42, "acc_stderr": 0.04960449637488584, "acc_norm": 0.42, "acc_norm_stderr": 0.04960449637488584 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5895953757225434, "acc_stderr": 0.03750757044895537, "acc_norm": 0.5895953757225434, "acc_norm_stderr": 0.03750757044895537 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.35294117647058826, "acc_stderr": 0.047551296160629475, "acc_norm": 0.35294117647058826, "acc_norm_stderr": 0.047551296160629475 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.72, "acc_stderr": 0.04512608598542127, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542127 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.46808510638297873, "acc_stderr": 0.03261936918467382, "acc_norm": 0.46808510638297873, "acc_norm_stderr": 0.03261936918467382 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.38596491228070173, "acc_stderr": 0.04579639422070434, "acc_norm": 0.38596491228070173, "acc_norm_stderr": 0.04579639422070434 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4896551724137931, "acc_stderr": 0.04165774775728763, "acc_norm": 0.4896551724137931, "acc_norm_stderr": 0.04165774775728763 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.42328042328042326, "acc_stderr": 0.025446365634406772, "acc_norm": 0.42328042328042326, "acc_norm_stderr": 0.025446365634406772 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3968253968253968, "acc_stderr": 0.04375888492727061, "acc_norm": 0.3968253968253968, "acc_norm_stderr": 0.04375888492727061 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6612903225806451, "acc_stderr": 0.026923446059302837, "acc_norm": 0.6612903225806451, "acc_norm_stderr": 0.026923446059302837 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.4433497536945813, "acc_stderr": 0.03495334582162934, "acc_norm": 0.4433497536945813, "acc_norm_stderr": 0.03495334582162934 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.696969696969697, "acc_stderr": 0.035886248000917075, "acc_norm": 0.696969696969697, "acc_norm_stderr": 0.035886248000917075 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7222222222222222, "acc_stderr": 0.03191178226713547, "acc_norm": 0.7222222222222222, "acc_norm_stderr": 0.03191178226713547 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7772020725388601, "acc_stderr": 0.03003114797764154, "acc_norm": 0.7772020725388601, "acc_norm_stderr": 0.03003114797764154 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5769230769230769, "acc_stderr": 0.025049197876042338, "acc_norm": 0.5769230769230769, "acc_norm_stderr": 0.025049197876042338 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3333333333333333, "acc_stderr": 0.028742040903948503, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.028742040903948503 }, "



