open-llm-leaderboard-old/details_karakuri-ai__karakuri-lm-70b-chat-v0.1
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 karakuri-ai/karakuri-lm-70b-chat-v0.1 的过程中自动创建的。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 运行次数:该数据集由 1 次运行创建。每次运行在各个配置中都对应一个独立的分割(split),分割以该次运行的时间戳命名。
- 最新结果:"train"分割总是指向最新的结果。
- 结果汇总:一个额外的配置"results"存储所有运行的汇总结果,用于计算和显示在Open LLM Leaderboard上的汇总指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_karakuri-ai__karakuri-lm-70b-chat-v0.1", "harness_winogrande_5", split="train")
```
最新结果
以下是运行 2024-02-02T05:58:35.536012 的最新结果: python { "all": { "acc": 0.5939008681138529, "acc_stderr": 0.0331497474958087, "acc_norm": 0.5979307998153371, "acc_norm_stderr": 0.033826160874214035, "mc1": 0.36474908200734396, "mc1_stderr": 0.016850961061720116, "mc2": 0.513917774256522, "mc2_stderr": 0.014972875043047422 }, "harness|arc:challenge|25": { "acc": 0.5716723549488054, "acc_stderr": 0.014460496367599017, "acc_norm": 0.6151877133105802, "acc_norm_stderr": 0.014218371065251104 }, "harness|hellaswag|10": { "acc": 0.637024497112129, "acc_stderr": 0.0047987512815608376, "acc_norm": 0.8313085042820155, "acc_norm_stderr": 0.003737138752336941 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4888888888888889, "acc_stderr": 0.04318275491977976, "acc_norm": 0.4888888888888889, "acc_norm_stderr": 0.04318275491977976 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6710526315789473, "acc_stderr": 0.038234289699266046, "acc_norm": 0.6710526315789473, "acc_norm_stderr": 0.038234289699266046 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6188679245283019, "acc_stderr": 0.029890609686286644, "acc_norm": 0.6188679245283019, "acc_norm_stderr": 0.029890609686286644 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6597222222222222, "acc_stderr": 0.039621355734862175, "acc_norm": 0.6597222222222222, "acc_norm_stderr": 0.039621355734862175 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.46, "acc_stderr": 0.05009082659620332, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5780346820809249, "acc_stderr": 0.0376574669386515, "acc_norm": 0.5780346820809249, "acc_norm_stderr": 0.0376574669386515 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.28431372549019607, "acc_stderr": 0.04488482852329017, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.04488482852329017 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.64, "acc_stderr": 0.04824181513244218, "acc_norm": 0.64, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5106382978723404, "acc_stderr": 0.03267862331014063, "acc_norm": 0.5106382978723404, "acc_norm_stderr": 0.03267862331014063 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.41228070175438597, "acc_stderr": 0.046306532033665956, "acc_norm": 0.41228070175438597, "acc_norm_stderr": 0.046306532033665956 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5448275862068965, "acc_stderr": 0.04149886942192118, "acc_norm": 0.5448275862068965, "acc_norm_stderr": 0.04149886942192118 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.37037037037037035, "acc_stderr": 0.024870815251057093, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.024870815251057093 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3492063492063492, "acc_stderr": 0.04263906892795132, "acc_norm": 0.3492063492063492, "acc_norm_stderr": 0.04263906892795132 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6774193548387096, "acc_stderr": 0.02659308451657227, "acc_norm": 0.6774193548387096, "acc_norm_stderr": 0.02659308451657227 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.3891625615763547, "acc_stderr": 0.034304624161038716, "acc_norm": 0.3891625615763547, "acc_norm_stderr": 0.034304624161038716 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7696969696969697, "acc_stderr": 0.0328766675860349, "acc_norm": 0.7696969696969697, "acc_norm_stderr": 0.0328766675860349 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7424242424242424, "acc_stderr": 0.031156269519646836, "acc_norm": 0.7424242424242424, "acc_norm_stderr": 0.031156269519646836 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8082901554404145, "acc_stderr": 0.028408953626245265, "acc_norm": 0.8082901554404145, "acc_norm_stderr": 0.028408953626245265 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6128205128205129, "acc_stderr": 0.02469721693087894, "acc_norm": 0.6128205128205129, "acc_norm_stderr": 0.02469721693087894 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3074074074074074, "acc_stderr": 0.028133252578815632,



