open-llm-leaderboard-old/details_JaeyeonKang__CCK_Gony_v3.3
数据集概述
数据集来源
该数据集是在对模型 JaeyeonKang/CCK_Gony_v3.3 进行评估运行期间自动创建的。
数据集组成
数据集由 63 个配置组成，每个配置对应一个评估任务。数据集由 1 次运行创建；每次运行在每个配置中对应一个特定的拆分，拆分以该运行的时间戳命名。"train" 拆分始终指向最新的结果。
数据集加载
要加载特定运行的详细信息，可以使用以下代码：

```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_JaeyeonKang__CCK_Gony_v3.3", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-02-02T14:16:13.968146 运行的最新结果: python { "all": { "acc": 0.7127245389126519, "acc_stderr": 0.030324827741987404, "acc_norm": 0.7169197048023546, "acc_norm_stderr": 0.030907560438510125, "mc1": 0.5250917992656059, "mc1_stderr": 0.017481446804104017, "mc2": 0.6741286843212236, "mc2_stderr": 0.015021442699186793 }, "harness|arc:challenge|25": { "acc": 0.674061433447099, "acc_stderr": 0.013697432466693242, "acc_norm": 0.7039249146757679, "acc_norm_stderr": 0.013340916085246252 }, "harness|hellaswag|10": { "acc": 0.6921927902808206, "acc_stderr": 0.004606429684604527, "acc_norm": 0.8788090021907986, "acc_norm_stderr": 0.0032568214188573178 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.37, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6814814814814815, "acc_stderr": 0.04024778401977108, "acc_norm": 0.6814814814814815, "acc_norm_stderr": 0.04024778401977108 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7960526315789473, "acc_stderr": 0.032790004063100495, "acc_norm": 0.7960526315789473, "acc_norm_stderr": 0.032790004063100495 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.76, "acc_stderr": 0.04292346959909283, "acc_norm": 0.76, "acc_norm_stderr": 0.04292346959909283 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7811320754716982, "acc_stderr": 0.0254478638251086, "acc_norm": 0.7811320754716982, "acc_norm_stderr": 0.0254478638251086 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7986111111111112, "acc_stderr": 0.033536474697138406, "acc_norm": 0.7986111111111112, "acc_norm_stderr": 0.033536474697138406 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.54, "acc_stderr": 0.05009082659620332, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.57, "acc_stderr": 0.04975698519562428, "acc_norm": 0.57, "acc_norm_stderr": 0.04975698519562428 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.48, "acc_stderr": 0.050211673156867795, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.7456647398843931, "acc_stderr": 0.0332055644308557, "acc_norm": 0.7456647398843931, "acc_norm_stderr": 0.0332055644308557 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4411764705882353, "acc_stderr": 0.049406356306056595, "acc_norm": 0.4411764705882353, "acc_norm_stderr": 0.049406356306056595 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.81, "acc_stderr": 0.039427724440366234, "acc_norm": 0.81, "acc_norm_stderr": 0.039427724440366234 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6723404255319149, "acc_stderr": 0.03068302084323101, "acc_norm": 0.6723404255319149, "acc_norm_stderr": 0.03068302084323101 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.6228070175438597, "acc_stderr": 0.04559522141958216, "acc_norm": 0.6228070175438597, "acc_norm_stderr": 0.04559522141958216 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6620689655172414, "acc_stderr": 0.039417076320648906, "acc_norm": 0.6620689655172414, "acc_norm_stderr": 0.039417076320648906 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4947089947089947, "acc_stderr": 0.02574986828855657, "acc_norm": 0.4947089947089947, "acc_norm_stderr": 0.02574986828855657 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5634920634920635, "acc_stderr": 0.04435932892851466, "acc_norm": 0.5634920634920635, "acc_norm_stderr": 0.04435932892851466 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8419354838709677, "acc_stderr": 0.020752831511875278, "acc_norm": 0.8419354838709677, "acc_norm_stderr": 0.020752831511875278 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.6009852216748769, "acc_stderr": 0.034454876862647144, "acc_norm": 0.6009852216748769, "acc_norm_stderr": 0.034454876862647144 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7818181818181819, "acc_stderr": 0.03225078108306289, "acc_norm": 0.7818181818181819, "acc_norm_stderr": 0.03225078108306289 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8636363636363636, "acc_stderr": 0.024450155973189835, "acc_norm": 0.8636363636363636, "acc_norm_stderr": 0.024450155973189835 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9481865284974094, "acc_stderr": 0.01599622932024412, "acc_norm": 0.9481865284974094, "acc_norm_stderr": 0.01599622932024412 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.7025641025641025, "acc_stderr": 0.023177408131465942, "acc_norm": 0.7025641025641025, "acc_norm_stderr": 0.023177408131465942 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.37777777777777777, "acc_stderr": 0.02956070739246572, "acc_norm": 0.37777777777777777, "acc_norm_stderr": 0.02956070739246572 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.7941176470588235, "acc_stderr



