open-llm-leaderboard-old/details_cloudyu__Mixtral_7Bx2_MoE
收藏数据集概述
数据集简介
该数据集是在对模型 cloudyu/Mixtral_7Bx2_MoE 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
数据集由 63 个配置组成,每个配置对应一个评估任务。数据集从 1 次运行中创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。"train" 分割始终指向最新的结果。
额外配置
一个额外的配置 "results" 存储了所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_cloudyu__Mixtral_7Bx2_MoE", "harness_winogrande_5", split="train")
最新结果
这些是最新的结果,来自 2023-12-30T02:50:07.869164 的运行:
python { "all": { "acc": 0.6544585464368745, "acc_stderr": 0.03200088458372204, "acc_norm": 0.6547015253813414, "acc_norm_stderr": 0.032655083043548944, "mc1": 0.5348837209302325, "mc1_stderr": 0.017460849975873972, "mc2": 0.6723305286969269, "mc2_stderr": 0.01523375567555562 }, "harness|arc:challenge|25": { "acc": 0.6834470989761092, "acc_stderr": 0.013592431519068079, "acc_norm": 0.712457337883959, "acc_norm_stderr": 0.013226719056266129 }, "harness|hellaswag|10": { "acc": 0.7013543118900617, "acc_stderr": 0.004567287775700558, "acc_norm": 0.8745269866560446, "acc_norm_stderr": 0.003305774980082251 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6444444444444445, "acc_stderr": 0.04135176749720386, "acc_norm": 0.6444444444444445, "acc_norm_stderr": 0.04135176749720386 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7039473684210527, "acc_stderr": 0.03715062154998904, "acc_norm": 0.7039473684210527, "acc_norm_stderr": 0.03715062154998904 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.62, "acc_stderr": 0.048783173121456316, "acc_norm": 0.62, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.720754716981132, "acc_stderr": 0.027611163402399715, "acc_norm": 0.720754716981132, "acc_norm_stderr": 0.027611163402399715 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7569444444444444, "acc_stderr": 0.0358687928008034, "acc_norm": 0.7569444444444444, "acc_norm_stderr": 0.0358687928008034 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.54, "acc_stderr": 0.05009082659620333, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620333 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.29, "acc_stderr": 0.04560480215720684, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6589595375722543, "acc_stderr": 0.03614665424180826, "acc_norm": 0.6589595375722543, "acc_norm_stderr": 0.03614665424180826 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.45098039215686275, "acc_stderr": 0.04951218252396262, "acc_norm": 0.45098039215686275, "acc_norm_stderr": 0.04951218252396262 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.76, "acc_stderr": 0.042923469599092816, "acc_norm": 0.76, "acc_norm_stderr": 0.042923469599092816 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5787234042553191, "acc_stderr": 0.03227834510146267, "acc_norm": 0.5787234042553191, "acc_norm_stderr": 0.03227834510146267 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5087719298245614, "acc_stderr": 0.04702880432049615, "acc_norm": 0.5087719298245614, "acc_norm_stderr": 0.04702880432049615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5655172413793104, "acc_stderr": 0.04130740879555498, "acc_norm": 0.5655172413793104, "acc_norm_stderr": 0.04130740879555498 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4365079365079365, "acc_stderr": 0.025542846817400496, "acc_norm": 0.4365079365079365, "acc_norm_stderr": 0.025542846817400496 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5158730158730159, "acc_stderr": 0.044698818540726076, "acc_norm": 0.5158730158730159, "acc_norm_stderr": 0.044698818540726076 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7838709677419354, "acc_stderr": 0.023415293433568525, "acc_norm": 0.7838709677419354, "acc_norm_stderr": 0.023415293433568525 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5172413793103449, "acc_stderr": 0.035158955511656986, "acc_norm": 0.5172413793103449, "acc_norm_stderr": 0.035158955511656986 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.68, "acc_stderr": 0.04688261722621505, "acc_norm": 0.68, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7878787878787878, "acc_stderr": 0.03192271569548301, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.03192271569548301 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8131313131313131, "acc_stderr": 0.027772533334218967, "acc_norm": 0.8131313131313131, "acc_norm_stderr": 0.027772533334218967 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9015544041450777, "acc_stderr": 0.02150024957603348, "acc_norm": 0.9015544041450777, "acc_norm_stderr": 0.02150024957603348 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6794871794871795, "acc_stderr": 0.02366129639396428, "acc_norm": 0.6794871794871795, "acc_norm_stderr": 0.02366129639396428 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.32222222222222224, "acc_stderr": 0.028493465091028593, "acc_norm": 0.32222222222222224, "acc_norm_stderr": 0.028493465091028593 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6848739495798319, "acc_stderr": 0.030176808288974337, "acc_norm": 0.6848739495798319, "acc_norm_stderr": 0.030176808288974337 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.36423841059602646, "acc_stderr": 0.03929111781242742, "acc_norm": 0.36423841059602646, "acc_norm_stderr": 0.03929111781242742 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8532110091743119, "acc_stderr": 0.01517314184512625, "acc_norm": 0.8532110091743119, "acc_norm_stderr": 0.01517314184512625 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5231481481481481, "acc_stderr": 0.034063153607115086, "acc_norm": 0.5231481481481481, "acc_norm_stderr": 0.034063153607115086 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8431372549019608, "acc_stderr": 0.02552472232455334, "acc_norm": 0.8431372549019608, "acc_norm_stderr": 0.02552472232455334 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.7974683544303798, "acc_stderr": 0.026160568246601446, "acc_norm": 0.7974683544303798, "acc_norm_stderr": 0.026160568246601446 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6905829596412556, "acc_stderr": 0.03102441174057221, "acc_norm": 0.6905829596412556, "acc_norm_stderr": 0.03102441174057221 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.7862595419847328, "acc_stderr": 0.0359546161177469, "acc_norm": 0.7862595419847328, "acc_norm_stderr": 0.0359546161177469 }, "harness|hendrycksTest-international_law|5": { "acc": 0.8016528925619835, "acc_stderr": 0.03640118271990947, "acc_norm": 0.8016528925619835, "acc_norm_stderr": 0.03640118271990947 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7685185185185185, "acc_stderr": 0.04077494709252627, "acc_norm": 0.7685185185185185, "acc_norm_stderr": 0.04077494709252627 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7791411042944786, "acc_stderr": 0.03259177392742178, "acc_norm": 0.7791411042944786, "acc_norm_stderr": 0.03259177392742178 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.4642857142857143, "acc_stderr": 0.04733667890053756, "acc_norm": 0.4642857142857143, "acc_norm_stderr": 0.04733667890053756 }, "harness|hendrycksTest-management|5": { "acc": 0.7766990291262136, "acc_stderr": 0.04123553189891431, "acc_norm": 0.7766990291262136, "acc_norm_stderr": 0.04123553189891431 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8888888888888888, "acc_stderr": 0.020588491316092375, "acc_norm": 0.8888888888888888, "acc_norm_stderr": 0.020588491316092375 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.8339719029374202, "acc_stderr": 0.013306478243066302, "acc_norm": 0.8339719029374202, "acc_norm_stderr": 0.013306478243066302 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7052023121387283, "acc_stderr": 0.024547617794803828, "acc_norm": 0.7052023121387283, "acc_norm_stderr": 0.024547617794803828 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.4558659217877095, "acc_stderr": 0.01665722942458631, "acc_norm": 0.4558659217877095, "acc_norm_stderr": 0.01665722942458631 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7352941176470589, "acc_stderr": 0.02526169121972948, "acc_norm": 0.7352941176470589, "acc_norm_stderr": 0.02526169121972948 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.7106109324758842, "acc_stderr": 0.02575586592263295, "acc_norm": 0.7106109324758842, "acc_norm_stderr": 0.02575586592263295 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7376543209876543, "acc_stderr": 0.02447722285613511, "acc_norm": 0.7376543209876543, "acc_norm_stderr": 0.02447722285613511 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.46808510638297873, "acc_stderr": 0.029766675075873862, "acc_norm": 0.46808510638297873, "acc_norm_stderr": 0.029766675075873862 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.46740547588005216, "acc_stderr": 0.012743072942653354, "acc_norm": 0.46740547588005216, "acc_norm_stderr": 0.012743072942653354 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6801470588235294, "acc_stderr": 0.028332959514031204, "acc_norm": 0.6801470588235294, "acc_norm_stderr": 0.028332959514031204 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6781045751633987, "acc_stderr": 0.018901015322093092, "acc_norm": 0.6781045751633987, "acc_norm_stderr": 0.018901015322093092 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6727272727272727, "acc_stderr": 0.0449429086625209, "acc_norm": 0.6727272727272727, "acc_norm_stderr": 0.0449429086625209 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7224489795918367, "acc_stderr": 0.028666857790274648, "acc_norm": 0.7224489795918367, "acc_norm_stderr": 0.028666857790274648 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8557213930348259, "acc_stderr": 0.024845753212306053, "acc_norm": 0.8557213930348259, "acc_norm_stderr": 0.024845753212306053 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.88, "acc_stderr": 0.03265986323710906, "acc_norm": 0.88, "acc_norm_stderr": 0.03265986323710906 }, "harness|hendrycksTest-virology|5": { "acc": 0.536144578313253, "acc_stderr": 0.038823108508905954, "acc_norm": 0.536144578313253, "acc_norm_stderr": 0.038823108508905954 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8421052631578947, "acc_stderr": 0.027966785859160893, "acc_norm": 0.8421052631578947, "acc_norm_stderr": 0.027966785859160893 }, "harness|truthfulqa:mc|0": { "mc1": 0.5348837209302325, "mc1_stderr": 0.017460849975873972, "mc2": 0.6723305286969269, "mc2_stderr": 0.01523375567555562 }, "harness|winogrande|5": { "acc": 0.8121546961325967, "acc_stderr": 0.010977481103435091 }, "harness|gsm8k|5": { "acc": 0.6846095526914329, "acc_stderr": 0.012799353675801832 } }



