open-llm-leaderboard-old/details_EmbeddedLLM__Mistral-7B-Merge-14-v0.3
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上对模型 EmbeddedLLM/Mistral-7B-Merge-14-v0.3 进行评估运行时自动创建的。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割，该分割以运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
最新结果
以下是 2023-12-23T16:20:58.598253 运行的最新结果:
python { "all": { "acc": 0.6461774129433163, "acc_stderr": 0.032111885448396486, "acc_norm": 0.6473493445530335, "acc_norm_stderr": 0.03275775413439696, "mc1": 0.40024479804161567, "mc1_stderr": 0.017151605555749138, "mc2": 0.5780394878984443, "mc2_stderr": 0.015529814806437723 }, "harness|arc:challenge|25": { "acc": 0.6160409556313993, "acc_stderr": 0.01421244498065189, "acc_norm": 0.659556313993174, "acc_norm_stderr": 0.013847460518892978 }, "harness|hellaswag|10": { "acc": 0.6740689105755826, "acc_stderr": 0.004677637463391395, "acc_norm": 0.8529177454690301, "acc_norm_stderr": 0.0035346403488166708 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6296296296296297, "acc_stderr": 0.04171654161354543, "acc_norm": 0.6296296296296297, "acc_norm_stderr": 0.04171654161354543 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7171052631578947, "acc_stderr": 0.03665349695640767, "acc_norm": 0.7171052631578947, "acc_norm_stderr": 0.03665349695640767 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6943396226415094, "acc_stderr": 0.028353298073322666, "acc_norm": 0.6943396226415094, "acc_norm_stderr": 0.028353298073322666 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7569444444444444, "acc_stderr": 0.035868792800803406, "acc_norm": 0.7569444444444444, "acc_norm_stderr": 0.035868792800803406 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.47, "acc_stderr": 0.05016135580465919, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.53, "acc_stderr": 0.050161355804659205, "acc_norm": 0.53, "acc_norm_stderr": 0.050161355804659205 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.27, "acc_stderr": 0.0446196043338474, "acc_norm": 0.27, "acc_norm_stderr": 0.0446196043338474 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6473988439306358, "acc_stderr": 0.03643037168958548, "acc_norm": 0.6473988439306358, "acc_norm_stderr": 0.03643037168958548 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4019607843137255, "acc_stderr": 0.04878608714466996, "acc_norm": 0.4019607843137255, "acc_norm_stderr": 0.04878608714466996 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.78, "acc_stderr": 0.04163331998932261, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932261 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5702127659574469, "acc_stderr": 0.03236214467715564, "acc_norm": 0.5702127659574469, "acc_norm_stderr": 0.03236214467715564 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5087719298245614, "acc_stderr": 0.04702880432049615, "acc_norm": 0.5087719298245614, "acc_norm_stderr": 0.04702880432049615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5379310344827586, "acc_stderr": 0.04154659671707548, "acc_norm": 0.5379310344827586, "acc_norm_stderr": 0.04154659671707548 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.41798941798941797, "acc_stderr": 0.02540255550326091, "acc_norm": 0.41798941798941797, "acc_norm_stderr": 0.02540255550326091 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4444444444444444, "acc_stderr": 0.044444444444444495, "acc_norm": 0.4444444444444444, "acc_norm_stderr": 0.044444444444444495 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7806451612903226, "acc_stderr": 0.023540799358723295, "acc_norm": 0.7806451612903226, "acc_norm_stderr": 0.023540799358723295 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.5073891625615764, "acc_stderr": 0.0351760354036101, "acc_norm": 0.5073891625615764, "acc_norm_stderr": 0.0351760354036101 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7575757575757576, "acc_stderr": 0.03346409881055953, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.03346409881055953 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7929292929292929, "acc_stderr": 0.028869778460267042, "acc_norm": 0.7929292929292929, "acc_norm_stderr": 0.028869778460267042 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9119170984455959, "acc_stderr": 0.02045374660160103, "acc_norm": 0.9119170984455959, "acc_norm_stderr": 0.02045374660160103 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6512820512820513, "acc_stderr": 0.02416278028401772, "acc_norm": 0.6512820512820513, "acc_norm_stderr": 0.02416278028401772 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.34814814814814815, "acc_stderr": 0.029045600290616255, "acc_norm": 0.34814814814814815, "acc_norm_stderr": 0.029045600290616255 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.70



