open-llm-leaderboard-old/details_Replete-AI__Phi-Stoma
数据集概述
该数据集是在对模型 Replete-AI/Phi-Stoma 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的拆分,拆分以该次运行的时间戳命名。
- "train" 拆分始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
最新结果
以下是 2024-03-13T03:43:11.049050 运行的最新结果:
python { "all": { "acc": 0.5063780266733802, "acc_stderr": 0.034017203819050865, "acc_norm": 0.5162966582334445, "acc_norm_stderr": 0.0349384489071447, "mc1": 0.2876376988984088, "mc1_stderr": 0.01584631510139481, "mc2": 0.5204899176249441, "mc2_stderr": 0.01541175371556815 }, "harness|arc:challenge|25": { "acc": 0.4539249146757679, "acc_stderr": 0.01454922110517187, "acc_norm": 0.48464163822525597, "acc_norm_stderr": 0.014604496129394916 }, "harness|hellaswag|10": { "acc": 0.4439354710217088, "acc_stderr": 0.004958314114266497, "acc_norm": 0.602867954590719, "acc_norm_stderr": 0.004883037758919961 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.29, "acc_stderr": 0.04560480215720684, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.45185185185185184, "acc_stderr": 0.04299268905480864, "acc_norm": 0.45185185185185184, "acc_norm_stderr": 0.04299268905480864 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.4605263157894737, "acc_stderr": 0.04056242252249032, "acc_norm": 0.4605263157894737, "acc_norm_stderr": 0.04056242252249032 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.61, "acc_stderr": 0.04902071300001974, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.5056603773584906, "acc_stderr": 0.030770900763851316, "acc_norm": 0.5056603773584906, "acc_norm_stderr": 0.030770900763851316 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5763888888888888, "acc_stderr": 0.04132125019723369, "acc_norm": 0.5763888888888888, "acc_norm_stderr": 0.04132125019723369 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.27, "acc_stderr": 0.04461960433384741, "acc_norm": 0.27, "acc_norm_stderr": 0.04461960433384741 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.33, "acc_stderr": 0.04725815626252604, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252604 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.26, "acc_stderr": 0.04408440022768079, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768079 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5086705202312138, "acc_stderr": 0.03811890988940412, "acc_norm": 0.5086705202312138, "acc_norm_stderr": 0.03811890988940412 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.28431372549019607, "acc_stderr": 0.04488482852329017, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.04488482852329017 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.61, "acc_stderr": 0.04902071300001975, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.4425531914893617, "acc_stderr": 0.03246956919789958, "acc_norm": 0.4425531914893617, "acc_norm_stderr": 0.03246956919789958 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.3684210526315789, "acc_stderr": 0.04537815354939392, "acc_norm": 0.3684210526315789, "acc_norm_stderr": 0.04537815354939392 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.4827586206896552, "acc_stderr": 0.04164188720169377, "acc_norm": 0.4827586206896552, "acc_norm_stderr": 0.04164188720169377 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.3386243386243386, "acc_stderr": 0.02437319786798306, "acc_norm": 0.3386243386243386, "acc_norm_stderr": 0.02437319786798306 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.3492063492063492, "acc_stderr": 0.04263906892795132, "acc_norm": 0.3492063492063492, "acc_norm_stderr": 0.04263906892795132 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6225806451612903, "acc_stderr": 0.02757596072327824, "acc_norm": 0.6225806451612903, "acc_norm_stderr": 0.02757596072327824 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.41379310344827586, "acc_stderr": 0.03465304488406796, 
"acc_norm": 0.41379310344827586, "acc_norm_stderr": 0.03465304488406796 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6727272727272727, "acc_stderr": 0.036639749943912434, "acc_norm": 0.6727272727272727, "acc_norm_stderr": 0.036639749943912434 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.6363636363636364, "acc_stderr": 0.03427308652999934, "acc_norm": 0.6363636363636364, "acc_norm_stderr": 0.03427308652999934 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.7305699481865285, "acc_stderr": 0.03201867122877794, "acc_norm": 0.7305699481865285, "acc_norm_stderr": 0.03201867122877794 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5205128205128206, "acc_stderr": 0.02532966316348994, "acc_norm": 0.5205128205128206, "acc_norm_stderr": 0.02532966316348994 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3, "acc_stderr": 0.027940457136228426, "acc_norm": 0.3, "acc_norm_stderr": 0.027940457136228426 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.5378151260504201, "acc_stderr": 0.03238546948758979, "acc_norm": 0.5378151260504201,



