trillionlabs/rbridge-gemini-2.5-pro
收藏Hugging Face2025-12-02 更新2025-12-20 收录
下载链接:
https://hf-mirror.com/datasets/trillionlabs/rbridge-gemini-2.5-pro
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: ChemBench
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: canary
dtype: string
- name: description
dtype: string
- name: examples
list:
- name: input
dtype: string
- name: target
dtype: string
- name: target_scores
dtype: string
- name: in_humansubset_w_tool
dtype: bool
- name: in_humansubset_wo_tool
dtype: bool
- name: input
dtype: string
- name: keywords
sequence: string
- name: metrics
sequence: string
- name: name
dtype: string
- name: output
dtype: string
- name: preferred_score
dtype: string
- name: subfield
dtype: string
- name: text
dtype: string
- name: uuid
dtype: string
splits:
- name: test
num_bytes: 3995683
num_examples: 815
download_size: 1864140
dataset_size: 3995683
- config_name: MolLangBench
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: canary
dtype: string
- name: edit_instructions
dtype: string
- name: edited_image
struct:
- name: bytes
dtype: binary
- name: path
dtype: 'null'
- name: edited_smiles
dtype: string
- name: image
struct:
- name: bytes
dtype: binary
- name: path
dtype: 'null'
- name: input
dtype: string
- name: note
dtype: string
- name: original_image
struct:
- name: bytes
dtype: binary
- name: path
dtype: 'null'
- name: original_smiles
dtype: string
- name: output
dtype: string
- name: result_1
dtype: string
- name: result_2
dtype: string
- name: smiles
dtype: string
- name: structure_description
dtype: string
- name: target_atoms
dtype: string
- name: task
dtype: string
- name: text
dtype: string
splits:
- name: test
num_bytes: 18512795
num_examples: 300
download_size: 17298124
dataset_size: 18512795
- config_name: MultiMedQA
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: choice_type
dtype: string
- name: cop
dtype: int64
- name: data
struct:
- name: Context
sequence: string
- name: Correct Answer
dtype: string
- name: Correct Option
dtype: string
- name: Long Answer
dtype: string
- name: Options
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: D
dtype: string
- name: Question
dtype: string
- name: exp
dtype: string
- name: id
dtype: string
- name: input
dtype: string
- name: opa
dtype: string
- name: opb
dtype: string
- name: opc
dtype: string
- name: opd
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject_name
dtype: string
- name: text
dtype: string
- name: topic_name
dtype: string
splits:
- name: test
num_bytes: 4501404
num_examples: 881
download_size: 2487110
dataset_size: 4501404
- config_name: agieval-math
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: query
dtype: string
splits:
- name: test
num_bytes: 2885807
num_examples: 1000
download_size: 1398082
dataset_size: 2885807
- config_name: aime25
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: string
- name: id
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: problem
dtype: string
- name: text
dtype: string
splits:
- name: test
num_bytes: 119619
num_examples: 30
download_size: 88534
dataset_size: 119619
- config_name: arc-ARC-Challenge
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answerKey
dtype: string
- name: choices
struct:
- name: label
sequence: string
- name: text
sequence: string
- name: id
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 3953655
num_examples: 1172
download_size: 2091178
dataset_size: 3953655
- config_name: arc-ARC-Easy
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answerKey
dtype: string
- name: choices
struct:
- name: label
sequence: string
- name: text
sequence: string
- name: id
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 7462970
num_examples: 2376
download_size: 3908253
dataset_size: 7462970
- config_name: arena-hard-v0.1-gpt-4o-2024-08-06
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: model
dtype: string
- name: output
dtype: string
splits:
- name: test
num_bytes: 5098865
num_examples: 500
download_size: 2797175
dataset_size: 5098865
- config_name: arena-hard-v2-gemini-2.5
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: model
dtype: string
- name: output
dtype: string
splits:
- name: test
num_bytes: 11597429
num_examples: 750
download_size: 6489275
dataset_size: 11597429
- config_name: attributionbench
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: attribution_label
dtype: string
- name: citation_links
sequence: string
- name: claim
dtype: string
- name: claim_raw_string
dtype: string
- name: id
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: references
sequence: string
- name: response
dtype: string
- name: src_dataset
dtype: string
- name: text
dtype: string
- name: webpage_references
sequence: 'null'
splits:
- name: test
num_bytes: 19639947
num_examples: 1606
download_size: 9863221
dataset_size: 19639947
- config_name: bbh-boolean_expressions
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 336040
num_examples: 250
download_size: 103827
dataset_size: 336040
- config_name: bbh-causal_judgement
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 672661
num_examples: 185
download_size: 279505
dataset_size: 672661
- config_name: bbh-date_understanding
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 418205
num_examples: 250
download_size: 166662
dataset_size: 418205
- config_name: bbh-disambiguation_qa
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 524844
num_examples: 250
download_size: 181841
dataset_size: 524844
- config_name: bbh-dyck_languages
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 550477
num_examples: 250
download_size: 192577
dataset_size: 550477
- config_name: bbh-formal_fallacies
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 1270454
num_examples: 250
download_size: 435320
dataset_size: 1270454
- config_name: bbh-geometric_shapes
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 730980
num_examples: 250
download_size: 292147
dataset_size: 730980
- config_name: bbh-hyperbaton
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 489981
num_examples: 250
download_size: 171065
dataset_size: 489981
- config_name: bbh-logical_deduction_five_objects
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 878413
num_examples: 250
download_size: 283956
dataset_size: 878413
- config_name: bbh-logical_deduction_seven_objects
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 1045248
num_examples: 250
download_size: 339275
dataset_size: 1045248
- config_name: bbh-logical_deduction_three_objects
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 580741
num_examples: 250
download_size: 176442
dataset_size: 580741
- config_name: bbh-movie_recommendation
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 843841
num_examples: 250
download_size: 408743
dataset_size: 843841
- config_name: bbh-multistep_arithmetic_two
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 410150
num_examples: 250
download_size: 149130
dataset_size: 410150
- config_name: bbh-navigate
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 470680
num_examples: 250
download_size: 162179
dataset_size: 470680
- config_name: bbh-object_counting
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 282951
num_examples: 250
download_size: 108262
dataset_size: 282951
- config_name: bbh-penguins_in_a_table
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 322459
num_examples: 146
download_size: 93840
dataset_size: 322459
- config_name: bbh-reasoning_about_colored_objects
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 433556
num_examples: 250
download_size: 138410
dataset_size: 433556
- config_name: bbh-ruin_names
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 546616
num_examples: 250
download_size: 242320
dataset_size: 546616
- config_name: bbh-salient_translation_error_detection
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 974857
num_examples: 250
download_size: 291597
dataset_size: 974857
- config_name: bbh-snarks
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 451997
num_examples: 178
download_size: 214551
dataset_size: 451997
- config_name: bbh-sports_understanding
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 727987
num_examples: 248
download_size: 365995
dataset_size: 727987
- config_name: bbh-temporal_sequences
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 752754
num_examples: 250
download_size: 251965
dataset_size: 752754
- config_name: bbh-tracking_shuffled_objects_five_objects
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 805465
num_examples: 250
download_size: 227521
dataset_size: 805465
- config_name: bbh-tracking_shuffled_objects_seven_objects
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 983969
num_examples: 250
download_size: 290973
dataset_size: 983969
- config_name: bbh-tracking_shuffled_objects_three_objects
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 612414
num_examples: 250
download_size: 166319
dataset_size: 612414
- config_name: bbh-web_of_lies
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 434155
num_examples: 250
download_size: 140911
dataset_size: 434155
- config_name: bbh-word_sorting
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: input
dtype: string
- name: output
dtype: string
- name: target
dtype: string
splits:
- name: test
num_bytes: 593714
num_examples: 250
download_size: 296481
dataset_size: 593714
- config_name: cqa
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answerKey
dtype: string
- name: choices
struct:
- name: label
sequence: string
- name: text
sequence: string
- name: id
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: question_concept
dtype: string
splits:
- name: test
num_bytes: 23329003
num_examples: 9741
download_size: 12617010
dataset_size: 23329003
- config_name: csatqa
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: Category
dtype: string
- name: Human_Peformance
dtype: float64
- name: context
dtype: string
- name: gold
dtype: int64
- name: input
dtype: string
- name: option#1
dtype: string
- name: option#2
dtype: string
- name: option#3
dtype: string
- name: option#4
dtype: string
- name: option#5
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: question_number
dtype: int64
- name: test_name
dtype: string
- name: text
dtype: string
splits:
- name: test
num_bytes: 19170173
num_examples: 912
download_size: 9588361
dataset_size: 19170173
- config_name: drop
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answers_spans
struct:
- name: spans
sequence: string
- name: types
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: passage
dtype: string
- name: query_id
dtype: string
- name: question
dtype: string
- name: section_id
dtype: string
- name: text
dtype: string
splits:
- name: test
num_bytes: 51733097
num_examples: 8687
download_size: 23639510
dataset_size: 51733097
- config_name: global-mmlu-ja
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: string
- name: country
dtype: string
- name: cultural_sensitivity_label
dtype: string
- name: culture
dtype: string
- name: input
dtype: string
- name: is_annotated
dtype: bool
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: reference
dtype: string
- name: region
dtype: string
- name: required_knowledge
dtype: string
- name: sample_id
dtype: string
- name: subject
dtype: string
- name: subject_category
dtype: string
- name: time_sensitive
dtype: string
splits:
- name: test
num_bytes: 66564951
num_examples: 13944
download_size: 33994057
dataset_size: 66564951
- config_name: global-mmlu-ko
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: string
- name: country
dtype: string
- name: cultural_sensitivity_label
dtype: string
- name: culture
dtype: string
- name: input
dtype: string
- name: is_annotated
dtype: bool
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: reference
dtype: string
- name: region
dtype: string
- name: required_knowledge
dtype: string
- name: sample_id
dtype: string
- name: subject
dtype: string
- name: subject_category
dtype: string
- name: time_sensitive
dtype: string
splits:
- name: test
num_bytes: 61046246
num_examples: 13952
download_size: 32116787
dataset_size: 61046246
- config_name: gpqa
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: Canary String
dtype: string
- name: Correct Answer
dtype: string
- name: Expert Validator Accuracy
dtype: float64
- name: Expert Validator Disagreement Category
dtype: float64
- name: Expert Validator_EV_1
dtype: string
- name: Expert Validator_EV_2
dtype: string
- name: Explanation
dtype: string
- name: Explanation_NEV_1
dtype: string
- name: Explanation_NEV_2
dtype: string
- name: Explanation_NEV_3
dtype: string
- name: Extra Revised Correct Answer
dtype: string
- name: Extra Revised Explanation
dtype: string
- name: Extra Revised Incorrect Answer 1
dtype: string
- name: Extra Revised Incorrect Answer 2
dtype: string
- name: Extra Revised Incorrect Answer 3
dtype: string
- name: Extra Revised Question
dtype: string
- name: Feedback_EV_1
dtype: string
- name: Feedback_EV_2
dtype: string
- name: Feedback_NEV_1
dtype: string
- name: Feedback_NEV_2
dtype: string
- name: Feedback_NEV_3
dtype: string
- name: High-level domain
dtype: string
- name: Incorrect Answer 1
dtype: string
- name: Incorrect Answer 2
dtype: string
- name: Incorrect Answer 3
dtype: string
- name: Is First Validation_EV_1
dtype: bool
- name: Is First Validation_EV_2
dtype: bool
- name: Majority Non-Expert Vals Incorrect
dtype: float64
- name: Manual Correctness Adjustment_EV_1
dtype: string
- name: Manual Correctness Adjustment_EV_2
dtype: string
- name: Manual Correctness Adjustment_NEV_1
dtype: string
- name: Manual Correctness Adjustment_NEV_2
dtype: 'null'
- name: Manual Correctness Adjustment_NEV_3
dtype: 'null'
- name: Non-Expert Validator Accuracy
dtype: float64
- name: Non-Expert Validator_NEV_1
dtype: string
- name: Non-Expert Validator_NEV_2
dtype: string
- name: Non-Expert Validator_NEV_3
dtype: string
- name: Post hoc agreement_EV_1
dtype: string
- name: Post hoc agreement_EV_2
dtype: string
- name: Pre-Revision Correct Answer
dtype: string
- name: Pre-Revision Explanation
dtype: string
- name: Pre-Revision Incorrect Answer 1
dtype: string
- name: Pre-Revision Incorrect Answer 2
dtype: string
- name: Pre-Revision Incorrect Answer 3
dtype: string
- name: Pre-Revision Question
dtype: string
- name: Probability Correct_EV_1
dtype: string
- name: Probability Correct_EV_2
dtype: string
- name: Probability Correct_NEV_1
dtype: string
- name: Probability Correct_NEV_2
dtype: string
- name: Probability Correct_NEV_3
dtype: string
- name: Question
dtype: string
- name: Question Difficulty_EV_1
dtype: string
- name: Question Difficulty_EV_2
dtype: string
- name: Question Writer
dtype: string
- name: Record ID
dtype: string
- name: Revision Comments (from Question Writer)
dtype: string
- name: Self-reported question-writing time (minutes)
dtype: float64
- name: Self-reported time (minutes)_EV_1
dtype: float64
- name: Self-reported time (minutes)_EV_2
dtype: float64
- name: Self-reported time (minutes)_NEV_1
dtype: float64
- name: Self-reported time (minutes)_NEV_2
dtype: float64
- name: Self-reported time (minutes)_NEV_3
dtype: float64
- name: Subdomain
dtype: string
- name: Sufficient Expertise?_EV_1
dtype: string
- name: Sufficient Expertise?_EV_2
dtype: string
- name: Understand the question?_EV_1
dtype: string
- name: Understand the question?_EV_2
dtype: string
- name: Validator Answered Correctly_EV_1
dtype: int64
- name: Validator Answered Correctly_EV_2
dtype: int64
- name: Validator Answered Correctly_NEV_1
dtype: int64
- name: Validator Answered Correctly_NEV_2
dtype: int64
- name: Validator Answered Correctly_NEV_3
dtype: float64
- name: Validator Revision Suggestion_EV_1
dtype: string
- name: Validator Revision Suggestion_EV_2
dtype: string
- name: Websites visited_NEV_1
dtype: string
- name: Websites visited_NEV_2
dtype: string
- name: Websites visited_NEV_3
dtype: string
- name: Writer's Difficulty Estimate
dtype: string
- name: input
dtype: string
- name: output
dtype: string
splits:
- name: test
num_bytes: 5322001
num_examples: 448
download_size: 2869070
dataset_size: 5322001
- config_name: gsm8k
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: text
dtype: string
splits:
- name: test
num_bytes: 4460071
num_examples: 1319
download_size: 2291622
dataset_size: 4460071
- config_name: haerae-general_knowledge
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: a
dtype: string
- name: answer
dtype: string
- name: b
dtype: string
- name: c
dtype: string
- name: d
dtype: string
- name: e
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: split
dtype: string
splits:
- name: test
num_bytes: 419010
num_examples: 176
download_size: 236907
dataset_size: 419010
- config_name: haerae-history
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: a
dtype: string
- name: answer
dtype: string
- name: b
dtype: string
- name: c
dtype: string
- name: d
dtype: string
- name: e
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: split
dtype: string
splits:
- name: test
num_bytes: 462858
num_examples: 188
download_size: 265640
dataset_size: 462858
- config_name: haerae-loan_words
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: a
dtype: string
- name: answer
dtype: string
- name: b
dtype: string
- name: c
dtype: string
- name: d
dtype: string
- name: e
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: split
dtype: string
splits:
- name: test
num_bytes: 342333
num_examples: 169
download_size: 171776
dataset_size: 342333
- config_name: haerae-rare_words
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: a
dtype: string
- name: answer
dtype: string
- name: b
dtype: string
- name: c
dtype: string
- name: d
dtype: string
- name: e
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: split
dtype: string
splits:
- name: test
num_bytes: 873146
num_examples: 405
download_size: 422804
dataset_size: 873146
- config_name: haerae-reading_comprehension
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: a
dtype: string
- name: answer
dtype: string
- name: b
dtype: string
- name: c
dtype: string
- name: d
dtype: string
- name: e
dtype: 'null'
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: split
dtype: string
splits:
- name: test
num_bytes: 2351143
num_examples: 447
download_size: 1257144
dataset_size: 2351143
- config_name: haerae-standard_nomenclature
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: a
dtype: string
- name: answer
dtype: string
- name: b
dtype: string
- name: c
dtype: string
- name: d
dtype: string
- name: e
dtype: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: split
dtype: string
splits:
- name: test
num_bytes: 329248
num_examples: 153
download_size: 165374
dataset_size: 329248
- config_name: hellaswag
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: activity_label
dtype: string
- name: ctx
dtype: string
- name: ctx_a
dtype: string
- name: ctx_b
dtype: string
- name: endings
sequence: string
- name: ind
dtype: int64
- name: input
dtype: string
- name: label
dtype: string
- name: output
dtype: string
- name: source_id
dtype: string
- name: split
dtype: string
- name: split_type
dtype: string
- name: text
dtype: string
splits:
- name: test
num_bytes: 61032644
num_examples: 10042
download_size: 32943807
dataset_size: 61032644
- config_name: hle
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: string
- name: answer_type
dtype: string
- name: author_name
dtype: string
- name: canary
dtype: string
- name: category
dtype: string
- name: id
dtype: string
- name: image
dtype: string
- name: image_preview
dtype: 'null'
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: rationale
dtype: string
- name: rationale_image
struct:
- name: bytes
dtype: binary
- name: path
dtype: 'null'
- name: raw_subject
dtype: string
splits:
- name: test
num_bytes: 81207862
num_examples: 2158
download_size: 73403422
dataset_size: 81207862
- config_name: hrm8k
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: float64
- name: input
dtype: string
- name: original
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: text
dtype: string
splits:
- name: test
num_bytes: 3912180
num_examples: 1319
download_size: 1874921
dataset_size: 3912180
- config_name: include-Albanian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1598766
num_examples: 551
download_size: 869505
dataset_size: 1598766
- config_name: include-Arabic
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1951446
num_examples: 552
download_size: 927489
dataset_size: 1951446
- config_name: include-Armenian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2536034
num_examples: 549
download_size: 1100550
dataset_size: 2536034
- config_name: include-Azerbaijani
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1690162
num_examples: 539
download_size: 866199
dataset_size: 1690162
- config_name: include-Basque
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1752763
num_examples: 500
download_size: 915919
dataset_size: 1752763
- config_name: include-Belarusian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2507106
num_examples: 550
download_size: 1111869
dataset_size: 2507106
- config_name: include-Bengali
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2176427
num_examples: 548
download_size: 810619
dataset_size: 2176427
- config_name: include-Bulgarian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: 'null'
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2385630
num_examples: 549
download_size: 1065330
dataset_size: 2385630
- config_name: include-Chinese
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1735002
num_examples: 545
download_size: 1056807
dataset_size: 1735002
- config_name: include-Croatian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: 'null'
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1503744
num_examples: 550
download_size: 890790
dataset_size: 1503744
- config_name: include-Dutch
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1820943
num_examples: 551
download_size: 986082
dataset_size: 1820943
- config_name: include-Estonian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 454304
num_examples: 224
download_size: 249177
dataset_size: 454304
- config_name: include-Finnish
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2293226
num_examples: 551
download_size: 1275375
dataset_size: 2293226
- config_name: include-French
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1157701
num_examples: 419
download_size: 632687
dataset_size: 1157701
- config_name: include-Georgian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2968205
num_examples: 500
download_size: 1005534
dataset_size: 2968205
- config_name: include-German
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 484617
num_examples: 138
download_size: 284525
dataset_size: 484617
- config_name: include-Greek
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 3110518
num_examples: 514
download_size: 1380450
dataset_size: 3110518
- config_name: include-Hebrew
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2262966
num_examples: 550
download_size: 1066116
dataset_size: 2262966
- config_name: include-Hindi
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 3002660
num_examples: 547
download_size: 1113819
dataset_size: 3002660
- config_name: include-Hungarian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: 'null'
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1538358
num_examples: 550
download_size: 852968
dataset_size: 1538358
- config_name: include-Indonesian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1713007
num_examples: 550
download_size: 872297
dataset_size: 1713007
- config_name: include-Italian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2281324
num_examples: 548
download_size: 1241256
dataset_size: 2281324
- config_name: include-Japanese
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2266537
num_examples: 501
download_size: 1202897
dataset_size: 2266537
- config_name: include-Kazakh
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1700882
num_examples: 500
download_size: 783273
dataset_size: 1700882
- config_name: include-Korean
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1909363
num_examples: 500
download_size: 951981
dataset_size: 1909363
- config_name: include-Lithuanian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1692160
num_examples: 534
download_size: 954835
dataset_size: 1692160
- config_name: include-Malay
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1525909
num_examples: 501
download_size: 758036
dataset_size: 1525909
- config_name: include-Malayalam
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2180381
num_examples: 479
download_size: 802336
dataset_size: 2180381
- config_name: include-Nepali
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2095256
num_examples: 500
download_size: 770698
dataset_size: 2095256
- config_name: include-North_Macedonian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: 'null'
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2551557
num_examples: 551
download_size: 1120617
dataset_size: 2551557
- config_name: include-Persian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2671037
num_examples: 548
download_size: 1243503
dataset_size: 2671037
- config_name: include-Polish
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1606772
num_examples: 548
download_size: 966439
dataset_size: 1606772
- config_name: include-Portuguese
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 3202343
num_examples: 551
download_size: 1769359
dataset_size: 3202343
- config_name: include-Russian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2921687
num_examples: 551
download_size: 1297197
dataset_size: 2921687
- config_name: include-Serbian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: 'null'
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2450730
num_examples: 550
download_size: 1206383
dataset_size: 2450730
- config_name: include-Spanish
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1765544
num_examples: 550
download_size: 963755
dataset_size: 1765544
- config_name: include-Tagalog
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1187141
num_examples: 500
download_size: 610058
dataset_size: 1187141
- config_name: include-Tamil
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2484453
num_examples: 550
download_size: 871985
dataset_size: 2484453
- config_name: include-Telugu
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2724407
num_examples: 548
download_size: 1048659
dataset_size: 2724407
- config_name: include-Turkish
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1956065
num_examples: 548
download_size: 1095535
dataset_size: 1956065
- config_name: include-Ukrainian
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2945260
num_examples: 550
download_size: 1264934
dataset_size: 2945260
- config_name: include-Urdu
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 905425
num_examples: 345
download_size: 439215
dataset_size: 905425
- config_name: include-Uzbek
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1402833
num_examples: 540
download_size: 765528
dataset_size: 1402833
- config_name: include-Vietnamese
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: country
dtype: string
- name: domain
dtype: string
- name: input
dtype: string
- name: language
dtype: string
- name: level
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: regional_feature
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2253950
num_examples: 550
download_size: 1033787
dataset_size: 2253950
- config_name: kmmlu-Accounting
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 455825
num_examples: 100
download_size: 230615
dataset_size: 455825
- config_name: kmmlu-Agricultural-Sciences
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 357250
num_examples: 100
download_size: 206916
dataset_size: 357250
- config_name: kmmlu-Aviation-Engineering-and-Maintenance
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 322664
num_examples: 100
download_size: 189804
dataset_size: 322664
- config_name: kmmlu-Biology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 307018
num_examples: 100
download_size: 184839
dataset_size: 307018
- config_name: kmmlu-Chemical-Engineering
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 344148
num_examples: 100
download_size: 199221
dataset_size: 344148
- config_name: kmmlu-Chemistry
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 329541
num_examples: 100
download_size: 187613
dataset_size: 329541
- config_name: kmmlu-Civil-Engineering
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 330934
num_examples: 100
download_size: 194608
dataset_size: 330934
- config_name: kmmlu-Computer-Science
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 317539
num_examples: 100
download_size: 182861
dataset_size: 317539
- config_name: kmmlu-Construction
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 316910
num_examples: 100
download_size: 190440
dataset_size: 316910
- config_name: kmmlu-Criminal-Law
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 604667
num_examples: 100
download_size: 331247
dataset_size: 604667
- config_name: kmmlu-Ecology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 335302
num_examples: 100
download_size: 198746
dataset_size: 335302
- config_name: kmmlu-Economics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 385275
num_examples: 100
download_size: 196469
dataset_size: 385275
- config_name: kmmlu-Education
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 415359
num_examples: 100
download_size: 240199
dataset_size: 415359
- config_name: kmmlu-Electrical-Engineering
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 299304
num_examples: 100
download_size: 167699
dataset_size: 299304
- config_name: kmmlu-Electronics-Engineering
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 336356
num_examples: 100
download_size: 185918
dataset_size: 336356
- config_name: kmmlu-Energy-Management
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 308471
num_examples: 100
download_size: 177754
dataset_size: 308471
- config_name: kmmlu-Environmental-Science
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 351108
num_examples: 100
download_size: 200202
dataset_size: 351108
- config_name: kmmlu-Fashion
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 317190
num_examples: 100
download_size: 189764
dataset_size: 317190
- config_name: kmmlu-Food-Processing
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 331478
num_examples: 100
download_size: 191062
dataset_size: 331478
- config_name: kmmlu-Gas-Technology-and-Engineering
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 323628
num_examples: 100
download_size: 188184
dataset_size: 323628
- config_name: kmmlu-Geomatics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 284953
num_examples: 100
download_size: 166859
dataset_size: 284953
- config_name: kmmlu-Health
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 370197
num_examples: 100
download_size: 217818
dataset_size: 370197
- config_name: kmmlu-Industrial-Engineer
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 315007
num_examples: 100
download_size: 187618
dataset_size: 315007
- config_name: kmmlu-Information-Technology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 312736
num_examples: 100
download_size: 182321
dataset_size: 312736
- config_name: kmmlu-Interior-Architecture-and-Design
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 311197
num_examples: 100
download_size: 182970
dataset_size: 311197
- config_name: kmmlu-Korean-History
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: 'null'
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 359045
num_examples: 86
download_size: 210930
dataset_size: 359045
- config_name: kmmlu-Law
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 465800
num_examples: 100
download_size: 244301
dataset_size: 465800
- config_name: kmmlu-Machine-Design-and-Manufacturing
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 310370
num_examples: 100
download_size: 178003
dataset_size: 310370
- config_name: kmmlu-Management
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 409094
num_examples: 100
download_size: 232225
dataset_size: 409094
- config_name: kmmlu-Maritime-Engineering
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 294984
num_examples: 100
download_size: 170507
dataset_size: 294984
- config_name: kmmlu-Marketing
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 359626
num_examples: 100
download_size: 209314
dataset_size: 359626
- config_name: kmmlu-Materials-Engineering
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 326734
num_examples: 100
download_size: 181762
dataset_size: 326734
- config_name: kmmlu-Math
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 257499
num_examples: 100
download_size: 133624
dataset_size: 257499
- config_name: kmmlu-Mechanical-Engineering
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 306944
num_examples: 100
download_size: 175846
dataset_size: 306944
- config_name: kmmlu-Nondestructive-Testing
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 342891
num_examples: 100
download_size: 198193
dataset_size: 342891
- config_name: kmmlu-Patent
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 663588
num_examples: 100
download_size: 338462
dataset_size: 663588
- config_name: kmmlu-Political-Science-and-Sociology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 442042
num_examples: 100
download_size: 253716
dataset_size: 442042
- config_name: kmmlu-Psychology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 398653
num_examples: 100
download_size: 228320
dataset_size: 398653
- config_name: kmmlu-Public-Safety
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 316026
num_examples: 100
download_size: 186026
dataset_size: 316026
- config_name: kmmlu-Railway-and-Automotive-Engineering
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 304652
num_examples: 100
download_size: 177866
dataset_size: 304652
- config_name: kmmlu-Real-Estate
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 481098
num_examples: 100
download_size: 246558
dataset_size: 481098
- config_name: kmmlu-Refrigerating-Machinery
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 326106
num_examples: 100
download_size: 183320
dataset_size: 326106
- config_name: kmmlu-Social-Welfare
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 347782
num_examples: 100
download_size: 195167
dataset_size: 347782
- config_name: kmmlu-Taxation
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 559924
num_examples: 100
download_size: 286171
dataset_size: 559924
- config_name: kmmlu-Telecommunications-and-Wireless-Technology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: Category
dtype: string
- name: D
dtype: string
- name: Human Accuracy
dtype: float64
- name: answer
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
splits:
- name: test
num_bytes: 313232
num_examples: 100
download_size: 179254
dataset_size: 313232
- config_name: kobest
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: alternative_1
dtype: string
- name: alternative_2
dtype: string
- name: context
dtype: string
- name: context_1
dtype: string
- name: context_2
dtype: string
- name: ending_1
dtype: string
- name: ending_2
dtype: string
- name: ending_3
dtype: string
- name: ending_4
dtype: string
- name: input
dtype: string
- name: label
dtype: int64
- name: output
dtype: string
- name: paragraph
dtype: string
- name: premise
dtype: string
- name: question
dtype: string
- name: sentence
dtype: string
- name: text
dtype: string
- name: word
dtype: string
splits:
- name: test
num_bytes: 1454712
num_examples: 500
download_size: 766558
dataset_size: 1454712
- config_name: math500
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: string
- name: input
dtype: string
- name: level
dtype: int64
- name: output
dtype: string
- name: problem
dtype: string
- name: solution
dtype: string
- name: subject
dtype: string
- name: text
dtype: string
- name: unique_id
dtype: string
splits:
- name: test
num_bytes: 1936110
num_examples: 500
download_size: 971988
dataset_size: 1936110
- config_name: mixeval-freeform
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: benchmark_name
dtype: string
- name: context
dtype: string
- name: id
dtype: string
- name: input
dtype: string
- name: options
sequence: string
- name: output
dtype: string
- name: problem_type
dtype: string
- name: prompt
dtype: string
- name: target
sequence: string
splits:
- name: test
num_bytes: 4423969
num_examples: 1446
download_size: 2498527
dataset_size: 4423969
- config_name: mixeval-multiplechoice
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: benchmark_name
dtype: string
- name: context
dtype: string
- name: id
dtype: string
- name: input
dtype: string
- name: options
sequence: string
- name: output
dtype: string
- name: problem_type
dtype: string
- name: prompt
dtype: string
- name: target
sequence: string
splits:
- name: test
num_bytes: 7260217
num_examples: 1739
download_size: 4072278
dataset_size: 7260217
- config_name: mmlu-abstract_algebra
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 396514
num_examples: 100
download_size: 186316
dataset_size: 396514
- config_name: mmlu-anatomy
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 438319
num_examples: 135
download_size: 224230
dataset_size: 438319
- config_name: mmlu-astronomy
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 585054
num_examples: 152
download_size: 319803
dataset_size: 585054
- config_name: mmlu-business_ethics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 392772
num_examples: 100
download_size: 213525
dataset_size: 392772
- config_name: mmlu-clinical_knowledge
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 944599
num_examples: 265
download_size: 512548
dataset_size: 944599
- config_name: mmlu-college_biology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 584846
num_examples: 144
download_size: 320550
dataset_size: 584846
- config_name: mmlu-college_chemistry
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 374156
num_examples: 100
download_size: 207141
dataset_size: 374156
- config_name: mmlu-college_computer_science
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 464743
num_examples: 100
download_size: 250759
dataset_size: 464743
- config_name: mmlu-college_mathematics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 412766
num_examples: 100
download_size: 217900
dataset_size: 412766
- config_name: mmlu-college_medicine
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 771462
num_examples: 173
download_size: 399091
dataset_size: 771462
- config_name: mmlu-college_physics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 334998
num_examples: 91
download_size: 180796
dataset_size: 334998
- config_name: mmlu-computer_security
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 367697
num_examples: 100
download_size: 209636
dataset_size: 367697
- config_name: mmlu-conceptual_physics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 726112
num_examples: 235
download_size: 365548
dataset_size: 726112
- config_name: mmlu-econometrics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 563050
num_examples: 114
download_size: 278069
dataset_size: 563050
- config_name: mmlu-electrical_engineering
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 465299
num_examples: 145
download_size: 248014
dataset_size: 465299
- config_name: mmlu-elementary_mathematics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 792039
num_examples: 377
download_size: 388268
dataset_size: 792039
- config_name: mmlu-formal_logic
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 530858
num_examples: 126
download_size: 221186
dataset_size: 530858
- config_name: mmlu-global_facts
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 260923
num_examples: 100
download_size: 142236
dataset_size: 260923
- config_name: mmlu-high_school_biology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1274666
num_examples: 310
download_size: 660783
dataset_size: 1274666
- config_name: mmlu-high_school_chemistry
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 792464
num_examples: 203
download_size: 395083
dataset_size: 792464
- config_name: mmlu-high_school_computer_science
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 393538
num_examples: 100
download_size: 211337
dataset_size: 393538
- config_name: mmlu-high_school_european_history
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1336493
num_examples: 165
download_size: 716685
dataset_size: 1336493
- config_name: mmlu-high_school_geography
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 637247
num_examples: 198
download_size: 349021
dataset_size: 637247
- config_name: mmlu-high_school_government_and_politics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 781581
num_examples: 193
download_size: 407738
dataset_size: 781581
- config_name: mmlu-high_school_macroeconomics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1578320
num_examples: 390
download_size: 733264
dataset_size: 1578320
- config_name: mmlu-high_school_mathematics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 800491
num_examples: 270
download_size: 396222
dataset_size: 800491
- config_name: mmlu-high_school_microeconomics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1012358
num_examples: 238
download_size: 480599
dataset_size: 1012358
- config_name: mmlu-high_school_physics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 632931
num_examples: 151
download_size: 315824
dataset_size: 632931
- config_name: mmlu-high_school_psychology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1939848
num_examples: 534
download_size: 1034794
dataset_size: 1939848
- config_name: mmlu-high_school_statistics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 983678
num_examples: 216
download_size: 482350
dataset_size: 983678
- config_name: mmlu-high_school_us_history
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1567131
num_examples: 204
download_size: 839557
dataset_size: 1567131
- config_name: mmlu-high_school_world_history
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1899496
num_examples: 237
download_size: 1032261
dataset_size: 1899496
- config_name: mmlu-human_aging
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 743431
num_examples: 223
download_size: 416261
dataset_size: 743431
- config_name: mmlu-human_sexuality
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 442853
num_examples: 131
download_size: 254679
dataset_size: 442853
- config_name: mmlu-international_law
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 599331
num_examples: 121
download_size: 312198
dataset_size: 599331
- config_name: mmlu-jurisprudence
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 487437
num_examples: 108
download_size: 267604
dataset_size: 487437
- config_name: mmlu-logical_fallacies
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 570790
num_examples: 163
download_size: 275460
dataset_size: 570790
- config_name: mmlu-machine_learning
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 505413
num_examples: 112
download_size: 263505
dataset_size: 505413
- config_name: mmlu-management
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 331324
num_examples: 103
download_size: 187277
dataset_size: 331324
- config_name: mmlu-marketing
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 752045
num_examples: 234
download_size: 397928
dataset_size: 752045
- config_name: mmlu-medical_genetics
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 340549
num_examples: 100
download_size: 189314
dataset_size: 340549
- config_name: mmlu-miscellaneous
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1950260
num_examples: 783
download_size: 1109892
dataset_size: 1950260
- config_name: mmlu-moral_disputes
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1369187
num_examples: 346
download_size: 724291
dataset_size: 1369187
- config_name: mmlu-moral_scenarios
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 3175519
num_examples: 895
download_size: 1284130
dataset_size: 3175519
- config_name: mmlu-nutrition
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1250528
num_examples: 306
download_size: 666736
dataset_size: 1250528
- config_name: mmlu-philosophy
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1196367
num_examples: 311
download_size: 647847
dataset_size: 1196367
- config_name: mmlu-prehistory
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1176086
num_examples: 324
download_size: 642502
dataset_size: 1176086
- config_name: mmlu-pro
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: string
- name: answer_index
dtype: int64
- name: category
dtype: string
- name: cot_content
dtype: string
- name: input
dtype: string
- name: options
sequence: string
- name: output
dtype: string
- name: question
dtype: string
- name: question_id
dtype: int64
- name: src
dtype: string
splits:
- name: test
num_bytes: 62433547
num_examples: 12032
download_size: 33078741
dataset_size: 62433547
- config_name: mmlu-professional_accounting
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1187699
num_examples: 282
download_size: 601601
dataset_size: 1187699
- config_name: mmlu-professional_law
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 11841622
num_examples: 1534
download_size: 6076477
dataset_size: 11841622
- config_name: mmlu-professional_medicine
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1512318
num_examples: 272
download_size: 835103
dataset_size: 1512318
- config_name: mmlu-professional_psychology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 2577687
num_examples: 611
download_size: 1366336
dataset_size: 2577687
- config_name: mmlu-public_relations
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 372143
num_examples: 108
download_size: 217327
dataset_size: 372143
- config_name: mmlu-security_studies
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 1508441
num_examples: 245
download_size: 800391
dataset_size: 1508441
- config_name: mmlu-sociology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 811007
num_examples: 201
download_size: 454642
dataset_size: 811007
- config_name: mmlu-us_foreign_policy
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 403284
num_examples: 99
download_size: 233714
dataset_size: 403284
- config_name: mmlu-virology
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 623374
num_examples: 166
download_size: 352074
dataset_size: 623374
- config_name: mmlu-world_religions
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: answer
dtype: int64
- name: choices
sequence: string
- name: input
dtype: string
- name: output
dtype: string
- name: question
dtype: string
- name: subject
dtype: string
splits:
- name: test
num_bytes: 453294
num_examples: 171
download_size: 265100
dataset_size: 453294
- config_name: mmmlu-JA_JP
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: Answer
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: D
dtype: string
- name: Question
dtype: string
- name: Subject
dtype: string
- name: 'Unnamed: 0'
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
splits:
- name: test
num_bytes: 65116427
num_examples: 13944
download_size: 33791366
dataset_size: 65116427
- config_name: mmmlu-KO_KR
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: A
dtype: string
- name: Answer
dtype: string
- name: B
dtype: string
- name: C
dtype: string
- name: D
dtype: string
- name: Question
dtype: string
- name: Subject
dtype: string
- name: 'Unnamed: 0'
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
splits:
- name: test
num_bytes: 59596704
num_examples: 13952
download_size: 31912422
dataset_size: 59596704
- config_name: rewardbench
features:
- name: question
dtype: string
- name: reasoning
dtype: string
- name: answer
dtype: string
- name: input
struct:
- name: chosen
dtype: string
- name: chosen_model
dtype: string
- name: id
dtype: int64
- name: input
dtype: string
- name: output
dtype: string
- name: prompt
dtype: string
- name: rejected
dtype: string
- name: rejected_model
dtype: string
- name: subset
dtype: string
splits:
- name: test
num_bytes: 24583412
num_examples: 3438
download_size: 13806076
dataset_size: 24583412
configs:
- config_name: ChemBench
data_files:
- split: test
path: ChemBench/test-*
- config_name: MolLangBench
data_files:
- split: test
path: MolLangBench/test-*
- config_name: MultiMedQA
data_files:
- split: test
path: MultiMedQA/test-*
- config_name: agieval-math
data_files:
- split: test
path: agieval-math/test-*
- config_name: aime25
data_files:
- split: test
path: aime25/test-*
- config_name: arc-ARC-Challenge
data_files:
- split: test
path: arc-ARC-Challenge/test-*
- config_name: arc-ARC-Easy
data_files:
- split: test
path: arc-ARC-Easy/test-*
- config_name: arena-hard-v0.1-gpt-4o-2024-08-06
data_files:
- split: test
path: arena-hard-v0.1-gpt-4o-2024-08-06/test-*
- config_name: arena-hard-v2-gemini-2.5
data_files:
- split: test
path: arena-hard-v2-gemini-2.5/test-*
- config_name: attributionbench
data_files:
- split: test
path: attributionbench/test-*
- config_name: bbh-boolean_expressions
data_files:
- split: test
path: bbh-boolean_expressions/test-*
- config_name: bbh-causal_judgement
data_files:
- split: test
path: bbh-causal_judgement/test-*
- config_name: bbh-date_understanding
data_files:
- split: test
path: bbh-date_understanding/test-*
- config_name: bbh-disambiguation_qa
data_files:
- split: test
path: bbh-disambiguation_qa/test-*
- config_name: bbh-dyck_languages
data_files:
- split: test
path: bbh-dyck_languages/test-*
- config_name: bbh-formal_fallacies
data_files:
- split: test
path: bbh-formal_fallacies/test-*
- config_name: bbh-geometric_shapes
data_files:
- split: test
path: bbh-geometric_shapes/test-*
- config_name: bbh-hyperbaton
data_files:
- split: test
path: bbh-hyperbaton/test-*
- config_name: bbh-logical_deduction_five_objects
data_files:
- split: test
path: bbh-logical_deduction_five_objects/test-*
- config_name: bbh-logical_deduction_seven_objects
data_files:
- split: test
path: bbh-logical_deduction_seven_objects/test-*
- config_name: bbh-logical_deduction_three_objects
data_files:
- split: test
path: bbh-logical_deduction_three_objects/test-*
- config_name: bbh-movie_recommendation
data_files:
- split: test
path: bbh-movie_recommendation/test-*
- config_name: bbh-multistep_arithmetic_two
data_files:
- split: test
path: bbh-multistep_arithmetic_two/test-*
- config_name: bbh-navigate
data_files:
- split: test
path: bbh-navigate/test-*
- config_name: bbh-object_counting
data_files:
- split: test
path: bbh-object_counting/test-*
- config_name: bbh-penguins_in_a_table
data_files:
- split: test
path: bbh-penguins_in_a_table/test-*
- config_name: bbh-reasoning_about_colored_objects
data_files:
- split: test
path: bbh-reasoning_about_colored_objects/test-*
- config_name: bbh-ruin_names
data_files:
- split: test
path: bbh-ruin_names/test-*
- config_name: bbh-salient_translation_error_detection
data_files:
- split: test
path: bbh-salient_translation_error_detection/test-*
- config_name: bbh-snarks
data_files:
- split: test
path: bbh-snarks/test-*
- config_name: bbh-sports_understanding
data_files:
- split: test
path: bbh-sports_understanding/test-*
- config_name: bbh-temporal_sequences
data_files:
- split: test
path: bbh-temporal_sequences/test-*
- config_name: bbh-tracking_shuffled_objects_five_objects
data_files:
- split: test
path: bbh-tracking_shuffled_objects_five_objects/test-*
- config_name: bbh-tracking_shuffled_objects_seven_objects
data_files:
- split: test
path: bbh-tracking_shuffled_objects_seven_objects/test-*
- config_name: bbh-tracking_shuffled_objects_three_objects
data_files:
- split: test
path: bbh-tracking_shuffled_objects_three_objects/test-*
- config_name: bbh-web_of_lies
data_files:
- split: test
path: bbh-web_of_lies/test-*
- config_name: bbh-word_sorting
data_files:
- split: test
path: bbh-word_sorting/test-*
- config_name: cqa
data_files:
- split: test
path: cqa/test-*
- config_name: csatqa
data_files:
- split: test
path: csatqa/test-*
- config_name: drop
data_files:
- split: test
path: drop/test-*
- config_name: global-mmlu-ja
data_files:
- split: test
path: global-mmlu-ja/test-*
- config_name: global-mmlu-ko
data_files:
- split: test
path: global-mmlu-ko/test-*
- config_name: gpqa
data_files:
- split: test
path: gpqa/test-*
- config_name: gsm8k
data_files:
- split: test
path: gsm8k/test-*
- config_name: haerae-general_knowledge
data_files:
- split: test
path: haerae-general_knowledge/test-*
- config_name: haerae-history
data_files:
- split: test
path: haerae-history/test-*
- config_name: haerae-loan_words
data_files:
- split: test
path: haerae-loan_words/test-*
- config_name: haerae-rare_words
data_files:
- split: test
path: haerae-rare_words/test-*
- config_name: haerae-reading_comprehension
data_files:
- split: test
path: haerae-reading_comprehension/test-*
- config_name: haerae-standard_nomenclature
data_files:
- split: test
path: haerae-standard_nomenclature/test-*
- config_name: hellaswag
data_files:
- split: test
path: hellaswag/test-*
- config_name: hle
data_files:
- split: test
path: hle/test-*
- config_name: hrm8k
data_files:
- split: test
path: hrm8k/test-*
- config_name: include-Albanian
data_files:
- split: test
path: include-Albanian/test-*
- config_name: include-Arabic
data_files:
- split: test
path: include-Arabic/test-*
- config_name: include-Armenian
data_files:
- split: test
path: include-Armenian/test-*
- config_name: include-Azerbaijani
data_files:
- split: test
path: include-Azerbaijani/test-*
- config_name: include-Basque
data_files:
- split: test
path: include-Basque/test-*
- config_name: include-Belarusian
data_files:
- split: test
path: include-Belarusian/test-*
- config_name: include-Bengali
data_files:
- split: test
path: include-Bengali/test-*
- config_name: include-Bulgarian
data_files:
- split: test
path: include-Bulgarian/test-*
- config_name: include-Chinese
data_files:
- split: test
path: include-Chinese/test-*
- config_name: include-Croatian
data_files:
- split: test
path: include-Croatian/test-*
- config_name: include-Dutch
data_files:
- split: test
path: include-Dutch/test-*
- config_name: include-Estonian
data_files:
- split: test
path: include-Estonian/test-*
- config_name: include-Finnish
data_files:
- split: test
path: include-Finnish/test-*
- config_name: include-French
data_files:
- split: test
path: include-French/test-*
- config_name: include-Georgian
data_files:
- split: test
path: include-Georgian/test-*
- config_name: include-German
data_files:
- split: test
path: include-German/test-*
- config_name: include-Greek
data_files:
- split: test
path: include-Greek/test-*
- config_name: include-Hebrew
data_files:
- split: test
path: include-Hebrew/test-*
- config_name: include-Hindi
data_files:
- split: test
path: include-Hindi/test-*
- config_name: include-Hungarian
data_files:
- split: test
path: include-Hungarian/test-*
- config_name: include-Indonesian
data_files:
- split: test
path: include-Indonesian/test-*
- config_name: include-Italian
data_files:
- split: test
path: include-Italian/test-*
- config_name: include-Japanese
data_files:
- split: test
path: include-Japanese/test-*
- config_name: include-Kazakh
data_files:
- split: test
path: include-Kazakh/test-*
- config_name: include-Korean
data_files:
- split: test
path: include-Korean/test-*
- config_name: include-Lithuanian
data_files:
- split: test
path: include-Lithuanian/test-*
- config_name: include-Malay
data_files:
- split: test
path: include-Malay/test-*
- config_name: include-Malayalam
data_files:
- split: test
path: include-Malayalam/test-*
- config_name: include-Nepali
data_files:
- split: test
path: include-Nepali/test-*
- config_name: include-North_Macedonian
data_files:
- split: test
path: include-North_Macedonian/test-*
- config_name: include-Persian
data_files:
- split: test
path: include-Persian/test-*
- config_name: include-Polish
data_files:
- split: test
path: include-Polish/test-*
- config_name: include-Portuguese
data_files:
- split: test
path: include-Portuguese/test-*
- config_name: include-Russian
data_files:
- split: test
path: include-Russian/test-*
- config_name: include-Serbian
data_files:
- split: test
path: include-Serbian/test-*
- config_name: include-Spanish
data_files:
- split: test
path: include-Spanish/test-*
- config_name: include-Tagalog
data_files:
- split: test
path: include-Tagalog/test-*
- config_name: include-Tamil
data_files:
- split: test
path: include-Tamil/test-*
- config_name: include-Telugu
data_files:
- split: test
path: include-Telugu/test-*
- config_name: include-Turkish
data_files:
- split: test
path: include-Turkish/test-*
- config_name: include-Ukrainian
data_files:
- split: test
path: include-Ukrainian/test-*
- config_name: include-Urdu
data_files:
- split: test
path: include-Urdu/test-*
- config_name: include-Uzbek
data_files:
- split: test
path: include-Uzbek/test-*
- config_name: include-Vietnamese
data_files:
- split: test
path: include-Vietnamese/test-*
- config_name: kmmlu-Accounting
data_files:
- split: test
path: kmmlu-Accounting/test-*
- config_name: kmmlu-Agricultural-Sciences
data_files:
- split: test
path: kmmlu-Agricultural-Sciences/test-*
- config_name: kmmlu-Aviation-Engineering-and-Maintenance
data_files:
- split: test
path: kmmlu-Aviation-Engineering-and-Maintenance/test-*
- config_name: kmmlu-Biology
data_files:
- split: test
path: kmmlu-Biology/test-*
- config_name: kmmlu-Chemical-Engineering
data_files:
- split: test
path: kmmlu-Chemical-Engineering/test-*
- config_name: kmmlu-Chemistry
data_files:
- split: test
path: kmmlu-Chemistry/test-*
- config_name: kmmlu-Civil-Engineering
data_files:
- split: test
path: kmmlu-Civil-Engineering/test-*
- config_name: kmmlu-Computer-Science
data_files:
- split: test
path: kmmlu-Computer-Science/test-*
- config_name: kmmlu-Construction
data_files:
- split: test
path: kmmlu-Construction/test-*
- config_name: kmmlu-Criminal-Law
data_files:
- split: test
path: kmmlu-Criminal-Law/test-*
- config_name: kmmlu-Ecology
data_files:
- split: test
path: kmmlu-Ecology/test-*
- config_name: kmmlu-Economics
data_files:
- split: test
path: kmmlu-Economics/test-*
- config_name: kmmlu-Education
data_files:
- split: test
path: kmmlu-Education/test-*
- config_name: kmmlu-Electrical-Engineering
data_files:
- split: test
path: kmmlu-Electrical-Engineering/test-*
- config_name: kmmlu-Electronics-Engineering
data_files:
- split: test
path: kmmlu-Electronics-Engineering/test-*
- config_name: kmmlu-Energy-Management
data_files:
- split: test
path: kmmlu-Energy-Management/test-*
- config_name: kmmlu-Environmental-Science
data_files:
- split: test
path: kmmlu-Environmental-Science/test-*
- config_name: kmmlu-Fashion
data_files:
- split: test
path: kmmlu-Fashion/test-*
- config_name: kmmlu-Food-Processing
data_files:
- split: test
path: kmmlu-Food-Processing/test-*
- config_name: kmmlu-Gas-Technology-and-Engineering
data_files:
- split: test
path: kmmlu-Gas-Technology-and-Engineering/test-*
- config_name: kmmlu-Geomatics
data_files:
- split: test
path: kmmlu-Geomatics/test-*
- config_name: kmmlu-Health
data_files:
- split: test
path: kmmlu-Health/test-*
- config_name: kmmlu-Industrial-Engineer
data_files:
- split: test
path: kmmlu-Industrial-Engineer/test-*
- config_name: kmmlu-Information-Technology
data_files:
- split: test
path: kmmlu-Information-Technology/test-*
- config_name: kmmlu-Interior-Architecture-and-Design
data_files:
- split: test
path: kmmlu-Interior-Architecture-and-Design/test-*
- config_name: kmmlu-Korean-History
data_files:
- split: test
path: kmmlu-Korean-History/test-*
- config_name: kmmlu-Law
data_files:
- split: test
path: kmmlu-Law/test-*
- config_name: kmmlu-Machine-Design-and-Manufacturing
data_files:
- split: test
path: kmmlu-Machine-Design-and-Manufacturing/test-*
- config_name: kmmlu-Management
data_files:
- split: test
path: kmmlu-Management/test-*
- config_name: kmmlu-Maritime-Engineering
data_files:
- split: test
path: kmmlu-Maritime-Engineering/test-*
- config_name: kmmlu-Marketing
data_files:
- split: test
path: kmmlu-Marketing/test-*
- config_name: kmmlu-Materials-Engineering
data_files:
- split: test
path: kmmlu-Materials-Engineering/test-*
- config_name: kmmlu-Math
data_files:
- split: test
path: kmmlu-Math/test-*
- config_name: kmmlu-Mechanical-Engineering
data_files:
- split: test
path: kmmlu-Mechanical-Engineering/test-*
- config_name: kmmlu-Nondestructive-Testing
data_files:
- split: test
path: kmmlu-Nondestructive-Testing/test-*
- config_name: kmmlu-Patent
data_files:
- split: test
path: kmmlu-Patent/test-*
- config_name: kmmlu-Political-Science-and-Sociology
data_files:
- split: test
path: kmmlu-Political-Science-and-Sociology/test-*
- config_name: kmmlu-Psychology
data_files:
- split: test
path: kmmlu-Psychology/test-*
- config_name: kmmlu-Public-Safety
data_files:
- split: test
path: kmmlu-Public-Safety/test-*
- config_name: kmmlu-Railway-and-Automotive-Engineering
data_files:
- split: test
path: kmmlu-Railway-and-Automotive-Engineering/test-*
- config_name: kmmlu-Real-Estate
data_files:
- split: test
path: kmmlu-Real-Estate/test-*
- config_name: kmmlu-Refrigerating-Machinery
data_files:
- split: test
path: kmmlu-Refrigerating-Machinery/test-*
- config_name: kmmlu-Social-Welfare
data_files:
- split: test
path: kmmlu-Social-Welfare/test-*
- config_name: kmmlu-Taxation
data_files:
- split: test
path: kmmlu-Taxation/test-*
- config_name: kmmlu-Telecommunications-and-Wireless-Technology
data_files:
- split: test
path: kmmlu-Telecommunications-and-Wireless-Technology/test-*
- config_name: kobest
data_files:
- split: test
path: kobest/test-*
- config_name: math500
data_files:
- split: test
path: math500/test-*
- config_name: mixeval-freeform
data_files:
- split: test
path: mixeval-freeform/test-*
- config_name: mixeval-multiplechoice
data_files:
- split: test
path: mixeval-multiplechoice/test-*
- config_name: mmlu-abstract_algebra
data_files:
- split: test
path: mmlu-abstract_algebra/test-*
- config_name: mmlu-anatomy
data_files:
- split: test
path: mmlu-anatomy/test-*
- config_name: mmlu-astronomy
data_files:
- split: test
path: mmlu-astronomy/test-*
- config_name: mmlu-business_ethics
data_files:
- split: test
path: mmlu-business_ethics/test-*
- config_name: mmlu-clinical_knowledge
data_files:
- split: test
path: mmlu-clinical_knowledge/test-*
- config_name: mmlu-college_biology
data_files:
- split: test
path: mmlu-college_biology/test-*
- config_name: mmlu-college_chemistry
data_files:
- split: test
path: mmlu-college_chemistry/test-*
- config_name: mmlu-college_computer_science
data_files:
- split: test
path: mmlu-college_computer_science/test-*
- config_name: mmlu-college_mathematics
data_files:
- split: test
path: mmlu-college_mathematics/test-*
- config_name: mmlu-college_medicine
data_files:
- split: test
path: mmlu-college_medicine/test-*
- config_name: mmlu-college_physics
data_files:
- split: test
path: mmlu-college_physics/test-*
- config_name: mmlu-computer_security
data_files:
- split: test
path: mmlu-computer_security/test-*
- config_name: mmlu-conceptual_physics
data_files:
- split: test
path: mmlu-conceptual_physics/test-*
- config_name: mmlu-econometrics
data_files:
- split: test
path: mmlu-econometrics/test-*
- config_name: mmlu-electrical_engineering
data_files:
- split: test
path: mmlu-electrical_engineering/test-*
- config_name: mmlu-elementary_mathematics
data_files:
- split: test
path: mmlu-elementary_mathematics/test-*
- config_name: mmlu-formal_logic
data_files:
- split: test
path: mmlu-formal_logic/test-*
- config_name: mmlu-global_facts
data_files:
- split: test
path: mmlu-global_facts/test-*
- config_name: mmlu-high_school_biology
data_files:
- split: test
path: mmlu-high_school_biology/test-*
- config_name: mmlu-high_school_chemistry
data_files:
- split: test
path: mmlu-high_school_chemistry/test-*
- config_name: mmlu-high_school_computer_science
data_files:
- split: test
path: mmlu-high_school_computer_science/test-*
- config_name: mmlu-high_school_european_history
data_files:
- split: test
path: mmlu-high_school_european_history/test-*
- config_name: mmlu-high_school_geography
data_files:
- split: test
path: mmlu-high_school_geography/test-*
- config_name: mmlu-high_school_government_and_politics
data_files:
- split: test
path: mmlu-high_school_government_and_politics/test-*
- config_name: mmlu-high_school_macroeconomics
data_files:
- split: test
path: mmlu-high_school_macroeconomics/test-*
- config_name: mmlu-high_school_mathematics
data_files:
- split: test
path: mmlu-high_school_mathematics/test-*
- config_name: mmlu-high_school_microeconomics
data_files:
- split: test
path: mmlu-high_school_microeconomics/test-*
- config_name: mmlu-high_school_physics
data_files:
- split: test
path: mmlu-high_school_physics/test-*
- config_name: mmlu-high_school_psychology
data_files:
- split: test
path: mmlu-high_school_psychology/test-*
- config_name: mmlu-high_school_statistics
data_files:
- split: test
path: mmlu-high_school_statistics/test-*
- config_name: mmlu-high_school_us_history
data_files:
- split: test
path: mmlu-high_school_us_history/test-*
- config_name: mmlu-high_school_world_history
data_files:
- split: test
path: mmlu-high_school_world_history/test-*
- config_name: mmlu-human_aging
data_files:
- split: test
path: mmlu-human_aging/test-*
- config_name: mmlu-human_sexuality
data_files:
- split: test
path: mmlu-human_sexuality/test-*
- config_name: mmlu-international_law
data_files:
- split: test
path: mmlu-international_law/test-*
- config_name: mmlu-jurisprudence
data_files:
- split: test
path: mmlu-jurisprudence/test-*
- config_name: mmlu-logical_fallacies
data_files:
- split: test
path: mmlu-logical_fallacies/test-*
- config_name: mmlu-machine_learning
data_files:
- split: test
path: mmlu-machine_learning/test-*
- config_name: mmlu-management
data_files:
- split: test
path: mmlu-management/test-*
- config_name: mmlu-marketing
data_files:
- split: test
path: mmlu-marketing/test-*
- config_name: mmlu-medical_genetics
data_files:
- split: test
path: mmlu-medical_genetics/test-*
- config_name: mmlu-miscellaneous
data_files:
- split: test
path: mmlu-miscellaneous/test-*
- config_name: mmlu-moral_disputes
data_files:
- split: test
path: mmlu-moral_disputes/test-*
- config_name: mmlu-moral_scenarios
data_files:
- split: test
path: mmlu-moral_scenarios/test-*
- config_name: mmlu-nutrition
data_files:
- split: test
path: mmlu-nutrition/test-*
- config_name: mmlu-philosophy
data_files:
- split: test
path: mmlu-philosophy/test-*
- config_name: mmlu-prehistory
data_files:
- split: test
path: mmlu-prehistory/test-*
- config_name: mmlu-pro
data_files:
- split: test
path: mmlu-pro/test-*
- config_name: mmlu-professional_accounting
data_files:
- split: test
path: mmlu-professional_accounting/test-*
- config_name: mmlu-professional_law
data_files:
- split: test
path: mmlu-professional_law/test-*
- config_name: mmlu-professional_medicine
data_files:
- split: test
path: mmlu-professional_medicine/test-*
- config_name: mmlu-professional_psychology
data_files:
- split: test
path: mmlu-professional_psychology/test-*
- config_name: mmlu-public_relations
data_files:
- split: test
path: mmlu-public_relations/test-*
- config_name: mmlu-security_studies
data_files:
- split: test
path: mmlu-security_studies/test-*
- config_name: mmlu-sociology
data_files:
- split: test
path: mmlu-sociology/test-*
- config_name: mmlu-us_foreign_policy
data_files:
- split: test
path: mmlu-us_foreign_policy/test-*
- config_name: mmlu-virology
data_files:
- split: test
path: mmlu-virology/test-*
- config_name: mmlu-world_religions
data_files:
- split: test
path: mmlu-world_religions/test-*
- config_name: mmmlu-JA_JP
data_files:
- split: test
path: mmmlu-JA_JP/test-*
- config_name: mmmlu-KO_KR
data_files:
- split: test
path: mmmlu-KO_KR/test-*
- config_name: rewardbench
data_files:
- split: test
path: rewardbench/test-*
---
提供机构:
trillionlabs



