open-llm-leaderboard-old/details_nlpguy__Hermes-low-tune
数据集概述
该数据集是在对模型 nlpguy/Hermes-low-tune 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。数据集包含 63 个配置,每个配置对应一个评估任务。数据集从 1 次运行中创建,每次运行的详细信息可以在每个配置中找到,以运行的时间戳命名的特定分片形式存储。"train" 分片始终指向最新的结果。
数据集结构
数据集包含以下配置:
- harness_arc_challenge_25
- harness_gsm8k_5
- harness_hellaswag_10
- harness_hendrycksTest_5
每个配置包含多个分片,包括特定运行的时间戳分片和最新的分片。
数据加载示例
以下是加载数据集的示例代码:
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_nlpguy__Hermes-low-tune", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-01-05T01:41:45.881402 运行的最新结果:
python { "all": { "acc": 0.6377579933924009, "acc_stderr": 0.03229992986349285, "acc_norm": 0.6395007863322063, "acc_norm_stderr": 0.032947905417029036, "mc1": 0.3488372093023256, "mc1_stderr": 0.016684419859986893, "mc2": 0.5136505046097176, "mc2_stderr": 0.014944315518959861 }, "harness|arc:challenge|25": { "acc": 0.6023890784982935, "acc_stderr": 0.014301752223279547, "acc_norm": 0.6399317406143344, "acc_norm_stderr": 0.01402751681458519 }, "harness|hellaswag|10": { "acc": 0.6439952200756821, "acc_stderr": 0.0047783807588511334, "acc_norm": 0.8374825731925911, "acc_norm_stderr": 0.0036817082825814575 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5703703703703704, "acc_stderr": 0.042763494943765995, "acc_norm": 0.5703703703703704, "acc_norm_stderr": 0.042763494943765995 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6973684210526315, "acc_stderr": 0.0373852067611967, "acc_norm": 0.6973684210526315, "acc_norm_stderr": 0.0373852067611967 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.56, "acc_stderr": 0.04988876515698589, "acc_norm": 0.56, "acc_norm_stderr": 0.04988876515698589 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6754716981132075, "acc_stderr": 0.02881561571343211, "acc_norm": 0.6754716981132075, "acc_norm_stderr": 0.02881561571343211 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7708333333333334, "acc_stderr": 0.03514697467862388, "acc_norm": 0.7708333333333334, "acc_norm_stderr": 0.03514697467862388 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.48, "acc_stderr": 0.050211673156867795, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.48, "acc_stderr": 0.050211673156867795, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.32, "acc_stderr": 0.04688261722621505, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6069364161849711, "acc_stderr": 0.0372424959581773, "acc_norm": 0.6069364161849711, "acc_norm_stderr": 0.0372424959581773 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4117647058823529, "acc_stderr": 0.04897104952726366, "acc_norm": 0.4117647058823529, "acc_norm_stderr": 0.04897104952726366 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.74, "acc_stderr": 0.0440844002276808, "acc_norm": 0.74, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5531914893617021, "acc_stderr": 0.0325005368436584, "acc_norm": 0.5531914893617021, "acc_norm_stderr": 0.0325005368436584 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4824561403508772, "acc_stderr": 0.04700708033551038, "acc_norm": 0.4824561403508772, "acc_norm_stderr": 0.04700708033551038 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5310344827586206, "acc_stderr": 0.04158632762097828, "acc_norm": 0.5310344827586206, "acc_norm_stderr": 0.04158632762097828 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.42063492063492064, "acc_stderr": 0.025424835086924, "acc_norm": 0.42063492063492064, "acc_norm_stderr": 0.025424835086924 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4444444444444444, "acc_stderr": 0.04444444444444449, "acc_norm": 0.4444444444444444, "acc_norm_stderr": 0.04444444444444449 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7774193548387097, "acc_stderr": 0.023664216671642514, "acc_norm": 0.7774193548387097, "acc_norm_stderr": 0.023664216671642514 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5369458128078818, 
"acc_stderr": 0.035083705204426656, "acc_norm": 0.5369458128078818, "acc_norm_stderr": 0.035083705204426656 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8, "acc_stderr": 0.031234752377721175, "acc_norm": 0.8, "acc_norm_stderr": 0.031234752377721175 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.797979797979798, "acc_stderr": 0.028606204289229865, "acc_norm": 0.797979797979798, "acc_norm_stderr": 0.028606204289229865 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8860103626943006, "acc_stderr": 0.022935144053919443, "acc_norm": 0.8860103626943006, "acc_norm_stderr": 0.022935144053919443 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6102564102564103, "acc_stderr": 0.024726967886647074, "acc_norm": 0.6102564102564103, "acc_norm_stderr": 0.024726967886647074 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2962962962962963, "acc_stderr": 0.027840811495871934, "acc_norm": 0.2962962962962963, "acc_norm_stderr": 0.027840811495871934 }, "harness|hendrycksTest-high_school_microeconom



