open-llm-leaderboard-old/details_genaicore3434__Mistral-7b-instruct-v0.2-summ-sft-lp-e1
数据集概述
数据集摘要
该数据集是在 Open LLM Leaderboard 上评估模型 genaicore3434/Mistral-7b-instruct-v0.2-summ-sft-lp-e1 的过程中自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。数据集由 2 次运行的结果构成;每次运行在各个配置中都对应一个特定的分割(split),分割名称即该次运行的时间戳。"train" 分割始终指向最新的结果。
数据集加载
要加载某次运行的详细信息,可以使用以下代码:

```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_genaicore3434__Mistral-7b-instruct-v0.2-summ-sft-lp-e1", "harness_winogrande_5", split="train")
```
最新结果
以下是 最新结果 的摘要: python { "all": { "acc": 0.59052978065543, "acc_stderr": 0.03349268505074206, "acc_norm": 0.5952047695238794, "acc_norm_stderr": 0.03418111471832376, "mc1": 0.4663402692778458, "mc1_stderr": 0.017463793867168106, "mc2": 0.6325766616332602, "mc2_stderr": 0.015487593519142183 }, "harness|arc:challenge|25": { "acc": 0.5477815699658704, "acc_stderr": 0.014544519880633825, "acc_norm": 0.5955631399317406, "acc_norm_stderr": 0.01434203648343618 }, "harness|hellaswag|10": { "acc": 0.6301533559051982, "acc_stderr": 0.004817763581410245, "acc_norm": 0.8227444732125074, "acc_norm_stderr": 0.0038110434120246627 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.562962962962963, "acc_stderr": 0.04284958639753401, "acc_norm": 0.562962962962963, "acc_norm_stderr": 0.04284958639753401 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6052631578947368, "acc_stderr": 0.039777499346220734, "acc_norm": 0.6052631578947368, "acc_norm_stderr": 0.039777499346220734 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.57, "acc_stderr": 0.04975698519562428, "acc_norm": 0.57, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6377358490566037, "acc_stderr": 0.0295822451283843, "acc_norm": 0.6377358490566037, "acc_norm_stderr": 0.0295822451283843 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6527777777777778, "acc_stderr": 0.039812405437178615, "acc_norm": 0.6527777777777778, "acc_norm_stderr": 0.039812405437178615 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.54, "acc_stderr": 0.05009082659620333, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620333 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5780346820809249, "acc_stderr": 0.0376574669386515, "acc_norm": 0.5780346820809249, "acc_norm_stderr": 0.0376574669386515 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4019607843137255, "acc_stderr": 0.04878608714466996, "acc_norm": 0.4019607843137255, "acc_norm_stderr": 0.04878608714466996 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.67, "acc_stderr": 0.04725815626252609, "acc_norm": 0.67, "acc_norm_stderr": 0.04725815626252609 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5148936170212766, "acc_stderr": 0.03267151848924777, "acc_norm": 0.5148936170212766, "acc_norm_stderr": 0.03267151848924777 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.39473684210526316, "acc_stderr": 0.045981880578165414, "acc_norm": 0.39473684210526316, "acc_norm_stderr": 0.045981880578165414 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.593103448275862, "acc_stderr": 0.04093793981266236, "acc_norm": 0.593103448275862, "acc_norm_stderr": 0.04093793981266236 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.37566137566137564, "acc_stderr": 0.024942368931159788, "acc_norm": 0.37566137566137564, "acc_norm_stderr": 0.024942368931159788 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3968253968253968, "acc_stderr": 0.043758884927270605, "acc_norm": 0.3968253968253968, "acc_norm_stderr": 0.043758884927270605 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.29, "acc_stderr": 0.04560480215720684, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6612903225806451, "acc_stderr": 0.026923446059302837, "acc_norm": 0.6612903225806451, "acc_norm_stderr": 0.026923446059302837 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.5073891625615764, "acc_stderr": 0.0351760354036101, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.0351760354036101 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.57, "acc_stderr": 0.04975698519562428, "acc_norm": 0.57, "acc_norm_stderr": 0.04975698519562428 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.703030303030303, "acc_stderr": 0.035679697722680495, "acc_norm": 0.703030303030303, "acc_norm_stderr": 0.035679697722680495 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7424242424242424, "acc_stderr": 0.03115626951964683, "acc_norm": 0.7424242424242424, "acc_norm_stderr": 0.03115626951964683 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8341968911917098, "acc_stderr": 0.026839845022314415, "acc_norm": 0.8341968911917098, "acc_norm_stderr": 0.026839845022314415 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5384615384615384, "acc_stderr": 0.025275892070240644, "acc_norm": 0.5384615384615384, "acc_norm_stderr": 0.025275892070240644 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.35555555555555557, "acc_stderr": 0.029185714949857406, "acc_norm": 0.35555555555555557, "acc_norm_stderr": 0.029185714949857406 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.554621848739



