open-llm-leaderboard-old/details_abhishek__autotrain-llama3-orpo-v2
数据集概述
该数据集是在模型 abhishek/autotrain-llama3-orpo-v2 的评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_abhishek__autotrain-llama3-orpo-v2", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-04-23T04:36:11.252475 运行的最新结果:
python { "all": { "acc": 0.6698788537533582, "acc_stderr": 0.03166555844096611, "acc_norm": 0.6726183210568676, "acc_norm_stderr": 0.032295641802360386, "mc1": 0.35128518971848227, "mc1_stderr": 0.016711358163544403, "mc2": 0.5220046543193273, "mc2_stderr": 0.01490809814037497 }, "harness|arc:challenge|25": { "acc": 0.5836177474402731, "acc_stderr": 0.014405618279436172, "acc_norm": 0.621160409556314, "acc_norm_stderr": 0.014175915490000326 }, "harness|hellaswag|10": { "acc": 0.5903206532563234, "acc_stderr": 0.00490769472793569, "acc_norm": 0.7950607448715395, "acc_norm_stderr": 0.004028322654852749 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.28, "acc_stderr": 0.04512608598542128, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6518518518518519, "acc_stderr": 0.041153246103369526, "acc_norm": 0.6518518518518519, "acc_norm_stderr": 0.041153246103369526 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7171052631578947, "acc_stderr": 0.03665349695640767, "acc_norm": 0.7171052631578947, "acc_norm_stderr": 0.03665349695640767 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7547169811320755, "acc_stderr": 0.026480357179895702, "acc_norm": 0.7547169811320755, "acc_norm_stderr": 0.026480357179895702 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7916666666666666, "acc_stderr": 0.033961162058453336, "acc_norm": 0.7916666666666666, "acc_norm_stderr": 0.033961162058453336 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.47, "acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.39, "acc_stderr": 0.04902071300001974, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.630057803468208, "acc_stderr": 0.0368122963339432, "acc_norm": 0.630057803468208, "acc_norm_stderr": 0.0368122963339432 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.5392156862745098, "acc_stderr": 0.049598599663841815, "acc_norm": 0.5392156862745098, "acc_norm_stderr": 0.049598599663841815 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.76, "acc_stderr": 0.04292346959909281, "acc_norm": 0.76, "acc_norm_stderr": 0.04292346959909281 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6085106382978723, "acc_stderr": 0.03190701242326812, "acc_norm": 0.6085106382978723, "acc_norm_stderr": 0.03190701242326812 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.6052631578947368, "acc_stderr": 0.04598188057816542, "acc_norm": 0.6052631578947368, "acc_norm_stderr": 0.04598188057816542 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6413793103448275, "acc_stderr": 0.039966295748767186, "acc_norm": 0.6413793103448275, "acc_norm_stderr": 0.039966295748767186 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4312169312169312, "acc_stderr": 0.02550648169813821, "acc_norm": 0.4312169312169312, "acc_norm_stderr": 0.02550648169813821 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.48412698412698413, "acc_stderr": 0.04469881854072606, "acc_norm": 0.48412698412698413, "acc_norm_stderr": 0.04469881854072606 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7935483870967742, "acc_stderr": 0.023025899617188695, "acc_norm": 0.7935483870967742, "acc_norm_stderr": 0.023025899617188695 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.5073891625615764, "acc_stderr": 0.035176035403610105, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7636363636363637, "acc_stderr": 0.03317505930009181, "acc_norm": 0.7636363636363637, "acc_norm_stderr": 0.03317505930009181 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8383838383838383, "acc_stderr": 0.02622591986362927, "acc_norm": 0.8383838383838383, "acc_norm_stderr": 0.02622591986362927 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.917098445595855, "acc_stderr": 0.01989934131572178, "acc_norm": 0.917098445595855, "acc_norm_stderr": 0.01989934131572178 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.658974358974359, "acc_stderr": 0.02403548967633507, "acc_norm": 0.658974358974359, "acc_norm_stderr": 0.02403548967633507 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3814814814814815, "acc_stderr": 0.029616718927497593, "acc_norm": 0.3814814814814815



