tytodd/gpt-oss-20b-v2-no-gepa
收藏Hugging Face2026-04-21 更新2026-04-26 收录
下载链接:
https://hf-mirror.com/datasets/tytodd/gpt-oss-20b-v2-no-gepa
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: aes2_essay_scoring
features:
- name: input
struct:
- name: full_text
dtype: string
- name: prediction
struct:
- name: score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 171141756
num_examples: 10000
- name: val
num_bytes: 16943873
num_examples: 1000
download_size: 179534277
dataset_size: 188085629
- config_name: anli_r1
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 50519515
num_examples: 10000
- name: val
num_bytes: 5277108
num_examples: 1000
download_size: 52546361
dataset_size: 55796623
- config_name: anli_r2
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 49762115
num_examples: 10000
- name: val
num_bytes: 5380873
num_examples: 1000
download_size: 51955218
dataset_size: 55142988
- config_name: anli_r3
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 108670196
num_examples: 20000
- name: val
num_bytes: 5603714
num_examples: 1000
download_size: 107777861
dataset_size: 114273910
- config_name: arc_challenge
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 4706837
num_examples: 1172
download_size: 4394565
dataset_size: 4706837
- config_name: argument_quality_ranking
features:
- name: input
struct:
- name: argument
dtype: string
- name: topic
dtype: string
- name: prediction
struct:
- name: quality_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 10906593
num_examples: 2469
download_size: 9766564
dataset_size: 10906593
- config_name: big_patent_innovation
features:
- name: input
struct:
- name: description
dtype: string
- name: prediction
struct:
- name: innovation_score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 105550615
num_examples: 6892
- name: val
num_bytes: 15283678
num_examples: 1000
download_size: 113938308
dataset_size: 120834293
- config_name: boardgame_qa
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 115692125
num_examples: 10000
- name: val
num_bytes: 20095665
num_examples: 2000
download_size: 123831071
dataset_size: 135787790
- config_name: chatbot_arena_conversations
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 117516693
num_examples: 14000
- name: val
num_bytes: 22698233
num_examples: 2500
download_size: 133478295
dataset_size: 140214926
- config_name: civil_comments
features:
- name: input
struct:
- name: comment
dtype: string
- name: prediction
struct:
- name: toxicity_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 43778973
num_examples: 10000
- name: val
num_bytes: 4281202
num_examples: 1000
download_size: 44594450
dataset_size: 48060175
- config_name: code_judge_bench
features:
- name: input
struct:
- name: code_A
dtype: string
- name: code_B
dtype: string
- name: problem
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 17042189
num_examples: 344
download_size: 16802378
dataset_size: 17042189
- config_name: colbert_humor_detection
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: humor_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 33780230
num_examples: 10000
- name: val
num_bytes: 3397096
num_examples: 1000
download_size: 34205802
dataset_size: 37177326
- config_name: customer_support_tickets_en
features:
- name: input
struct:
- name: body
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: queue
dtype: string
- name: type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 42618153
num_examples: 5570
- name: val
num_bytes: 7772857
num_examples: 1000
download_size: 47702487
dataset_size: 50391010
- config_name: customer_support_tickets_gorkem
features:
- name: input
struct:
- name: ticket_text
dtype: string
- name: prediction
struct:
- name: ticket_subject
dtype: string
- name: ticket_type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 47419669
num_examples: 6775
- name: val
num_bytes: 6999064
num_examples: 1000
download_size: 50890469
dataset_size: 54418733
- config_name: dbpedia_easy
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: l1_class
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 53386521
num_examples: 10000
- name: val
num_bytes: 53533810
num_examples: 10000
download_size: 100825026
dataset_size: 106920331
- config_name: dbpedia_hard
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: l1_class
dtype: string
- name: l2_class
dtype: string
- name: l3_class
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 196548820
num_examples: 10000
- name: val
num_bytes: 195775338
num_examples: 10000
download_size: 384385323
dataset_size: 392324158
- config_name: dbpedia_medium
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: l1_class
dtype: string
- name: l2_class
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 83599939
num_examples: 10000
- name: val
num_bytes: 83006578
num_examples: 10000
download_size: 159957737
dataset_size: 166606517
- config_name: enron_email_quality
features:
- name: input
struct:
- name: body
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: quality_score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 51633021
num_examples: 6500
- name: val
num_bytes: 8027256
num_examples: 1000
download_size: 53253711
dataset_size: 59660277
- config_name: enron_email_type
features:
- name: input
struct:
- name: body
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: email_type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 53997725
num_examples: 7000
- name: val
num_bytes: 7783973
num_examples: 1000
download_size: 56195153
dataset_size: 61781698
- config_name: enron_reply_quality
features:
- name: input
struct:
- name: original_email
dtype: string
- name: reply
dtype: string
- name: prediction
struct:
- name: quality
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 53623763
num_examples: 7000
- name: val
num_bytes: 7603506
num_examples: 1000
download_size: 55357555
dataset_size: 61227269
- config_name: go_emotions
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: labels
list: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 51998474
num_examples: 10000
- name: val
num_bytes: 5149271
num_examples: 1000
download_size: 54074044
dataset_size: 57147745
- config_name: gpqa_diamond
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 3424259
num_examples: 198
download_size: 3335339
dataset_size: 3424259
- config_name: halueval_dialogue
features:
- name: input
struct:
- name: dialogue_history
dtype: string
- name: knowledge
dtype: string
- name: response
dtype: string
- name: prediction
struct:
- name: hallucination
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 46042748
num_examples: 8000
- name: val
num_bytes: 11550561
num_examples: 2000
download_size: 53620966
dataset_size: 57593309
- config_name: halueval_qa
features:
- name: input
struct:
- name: answer
dtype: string
- name: knowledge
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: hallucination
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 39539594
num_examples: 8000
- name: val
num_bytes: 9945802
num_examples: 2000
download_size: 46434656
dataset_size: 49485396
- config_name: halueval_summarization
features:
- name: input
struct:
- name: document
dtype: string
- name: summary
dtype: string
- name: prediction
struct:
- name: hallucination
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 159715391
num_examples: 10000
download_size: 153814994
dataset_size: 159715391
- config_name: hh_rlhf
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 169958907
num_examples: 32000
- name: val
num_bytes: 5394876
num_examples: 1000
download_size: 166262140
dataset_size: 175353783
- config_name: judge_bench
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 6465961
num_examples: 280
download_size: 6348525
dataset_size: 6465961
- config_name: lex_glue_case_hold
features:
- name: input
struct:
- name: context
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: option_e
dtype: string
- name: prediction
struct:
- name: selected_option
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 138484423
num_examples: 10000
- name: val
num_bytes: 13710903
num_examples: 1000
download_size: 147739955
dataset_size: 152195326
- config_name: lex_glue_ledgar
features:
- name: input
struct:
- name: provision_text
dtype: string
- name: prediction
struct:
- name: provision_type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 93422782
num_examples: 10000
- name: val
num_bytes: 9199485
num_examples: 1000
download_size: 98807414
dataset_size: 102622267
- config_name: medical_abstracts
features:
- name: input
struct:
- name: medical_abstract
dtype: string
- name: prediction
struct:
- name: condition_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 74132694
num_examples: 10000
- name: val
num_bytes: 7423978
num_examples: 1000
download_size: 76984299
dataset_size: 81556672
- config_name: mfrc
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: annotation
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 54910525
num_examples: 10000
- name: val
num_bytes: 5097254
num_examples: 1000
download_size: 54925670
dataset_size: 60007779
- config_name: mmlu
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 157319752
num_examples: 20000
- name: val
num_bytes: 4804945
num_examples: 1000
download_size: 155941164
dataset_size: 162124697
- config_name: mmlu_pro
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 98437284
num_examples: 12032
download_size: 93938176
dataset_size: 98437284
- config_name: mt_bench_human_judgments
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 9757505
num_examples: 1000
download_size: 9171706
dataset_size: 9757505
- config_name: musr_murder_mysteries
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 4445702
num_examples: 250
download_size: 4324709
dataset_size: 4445702
- config_name: musr_object_placements
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 3879533
num_examples: 256
download_size: 3804399
dataset_size: 3879533
- config_name: musr_team_allocation
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 4017875
num_examples: 250
download_size: 3910724
dataset_size: 4017875
- config_name: or_bench_80k
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 189003930
num_examples: 20000
- name: val
num_bytes: 9395573
num_examples: 1000
download_size: 187840954
dataset_size: 198399503
- config_name: or_bench_hard_1k
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 5306537
num_examples: 1055
- name: val
num_bytes: 1381187
num_examples: 264
download_size: 6254765
dataset_size: 6687724
- config_name: or_bench_toxic
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 2119280
num_examples: 524
download_size: 1969192
dataset_size: 2119280
- config_name: projudgebench
features:
- name: input
struct:
- name: correct_answer
dtype: string
- name: question
dtype: string
- name: step_to_evaluate
dtype: string
- name: steps
list: string
- name: prediction
struct:
- name: correct
dtype: bool
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 29590274
num_examples: 2160
- name: val
num_bytes: 3401140
num_examples: 240
download_size: 32036835
dataset_size: 32991414
- config_name: reward_bench_2
features:
- name: input
struct:
- name: prompt
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 18528819
num_examples: 1492
- name: val
num_bytes: 4536715
num_examples: 373
download_size: 22304443
dataset_size: 23065534
- config_name: rod101_essay_scoring
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 2020517
num_examples: 81
download_size: 1998802
dataset_size: 2020517
- config_name: sem_eval_2010_task_8
features:
- name: input
struct:
- name: sentence
dtype: string
- name: prediction
struct:
- name: relation_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 57828971
num_examples: 8000
- name: val
num_bytes: 19487507
num_examples: 2717
download_size: 73437025
dataset_size: 77316478
- config_name: smollm_corpus
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: audience
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 272891783
num_examples: 20000
- name: val
num_bytes: 13627189
num_examples: 1000
download_size: 276746969
dataset_size: 286518972
- config_name: snli
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 40344375
num_examples: 10000
- name: val
num_bytes: 4051783
num_examples: 1000
download_size: 40834516
dataset_size: 44396158
- config_name: spartqa_mchoice
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 89469834
num_examples: 10000
- name: val
num_bytes: 9031816
num_examples: 1000
download_size: 92464877
dataset_size: 98501650
- config_name: toxigen_data
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: toxicity_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 31453061
num_examples: 8960
- name: val
num_bytes: 3425975
num_examples: 940
download_size: 32151555
dataset_size: 34879036
- config_name: tweet_eval_emotion
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: emotion_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 11392425
num_examples: 3257
- name: val
num_bytes: 1276686
num_examples: 374
download_size: 11780047
dataset_size: 12669111
- config_name: tweet_eval_hate
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: hate_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 34921915
num_examples: 8993
- name: val
num_bytes: 4023313
num_examples: 999
download_size: 35897266
dataset_size: 38945228
- config_name: tweet_eval_irony
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: irony_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 10807970
num_examples: 2862
- name: val
num_bytes: 3625044
num_examples: 955
download_size: 13266041
dataset_size: 14433014
- config_name: tweet_eval_offensive
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: offensive_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 39973013
num_examples: 10000
- name: val
num_bytes: 3985297
num_examples: 1000
download_size: 40412187
dataset_size: 43958310
- config_name: tweet_eval_sentiment
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: sentiment_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 32734151
num_examples: 10000
- name: val
num_bytes: 3274690
num_examples: 1000
download_size: 33508797
dataset_size: 36008841
- config_name: ultrafeedback
features:
- name: input
struct:
- name: prompt
dtype: string
- name: response
dtype: string
- name: prediction
struct:
- name: instruction_following
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 109342887
num_examples: 10000
- name: val
num_bytes: 9934382
num_examples: 1000
download_size: 113911975
dataset_size: 119277269
- config_name: writingprompts_quality
features:
- name: input
struct:
- name: prompt
dtype: string
- name: story
dtype: string
- name: prediction
struct:
- name: quality_score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 84703376
num_examples: 7000
- name: val
num_bytes: 12172517
num_examples: 1000
download_size: 91309291
dataset_size: 96875893
- config_name: yahoo_answers_quality
features:
- name: input
struct:
- name: answer
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: rating
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 42630084
num_examples: 7000
- name: val
num_bytes: 6175186
num_examples: 1000
download_size: 45928270
dataset_size: 48805270
- config_name: yelp
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: rating
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 105556973
num_examples: 20000
- name: val
num_bytes: 5291778
num_examples: 1000
download_size: 103446301
dataset_size: 110848751
configs:
- config_name: aes2_essay_scoring
data_files:
- split: train
path: aes2_essay_scoring/train-*
- split: val
path: aes2_essay_scoring/val-*
- config_name: anli_r1
data_files:
- split: train
path: anli_r1/train-*
- split: val
path: anli_r1/val-*
- config_name: anli_r2
data_files:
- split: train
path: anli_r2/train-*
- split: val
path: anli_r2/val-*
- config_name: anli_r3
data_files:
- split: train
path: anli_r3/train-*
- split: val
path: anli_r3/val-*
- config_name: arc_challenge
data_files:
- split: ood
path: arc_challenge/ood-*
- config_name: argument_quality_ranking
data_files:
- split: ood
path: argument_quality_ranking/ood-*
- config_name: big_patent_innovation
data_files:
- split: train
path: big_patent_innovation/train-*
- split: val
path: big_patent_innovation/val-*
- config_name: boardgame_qa
data_files:
- split: train
path: boardgame_qa/train-*
- split: val
path: boardgame_qa/val-*
- config_name: chatbot_arena_conversations
data_files:
- split: train
path: chatbot_arena_conversations/train-*
- split: val
path: chatbot_arena_conversations/val-*
- config_name: civil_comments
data_files:
- split: train
path: civil_comments/train-*
- split: val
path: civil_comments/val-*
- config_name: code_judge_bench
data_files:
- split: ood
path: code_judge_bench/ood-*
- config_name: colbert_humor_detection
data_files:
- split: train
path: colbert_humor_detection/train-*
- split: val
path: colbert_humor_detection/val-*
- config_name: customer_support_tickets_en
data_files:
- split: train
path: customer_support_tickets_en/train-*
- split: val
path: customer_support_tickets_en/val-*
- config_name: customer_support_tickets_gorkem
data_files:
- split: train
path: customer_support_tickets_gorkem/train-*
- split: val
path: customer_support_tickets_gorkem/val-*
- config_name: dbpedia_easy
data_files:
- split: train
path: dbpedia_easy/train-*
- split: val
path: dbpedia_easy/val-*
- config_name: dbpedia_hard
data_files:
- split: train
path: dbpedia_hard/train-*
- split: val
path: dbpedia_hard/val-*
- config_name: dbpedia_medium
data_files:
- split: train
path: dbpedia_medium/train-*
- split: val
path: dbpedia_medium/val-*
- config_name: enron_email_quality
data_files:
- split: train
path: enron_email_quality/train-*
- split: val
path: enron_email_quality/val-*
- config_name: enron_email_type
data_files:
- split: train
path: enron_email_type/train-*
- split: val
path: enron_email_type/val-*
- config_name: enron_reply_quality
data_files:
- split: train
path: enron_reply_quality/train-*
- split: val
path: enron_reply_quality/val-*
- config_name: go_emotions
data_files:
- split: train
path: go_emotions/train-*
- split: val
path: go_emotions/val-*
- config_name: gpqa_diamond
data_files:
- split: ood
path: gpqa_diamond/ood-*
- config_name: halueval_dialogue
data_files:
- split: train
path: halueval_dialogue/train-*
- split: val
path: halueval_dialogue/val-*
- config_name: halueval_qa
data_files:
- split: train
path: halueval_qa/train-*
- split: val
path: halueval_qa/val-*
- config_name: halueval_summarization
data_files:
- split: ood
path: halueval_summarization/ood-*
- config_name: hh_rlhf
data_files:
- split: train
path: hh_rlhf/train-*
- split: val
path: hh_rlhf/val-*
- config_name: judge_bench
data_files:
- split: ood
path: judge_bench/ood-*
- config_name: lex_glue_case_hold
data_files:
- split: train
path: lex_glue_case_hold/train-*
- split: val
path: lex_glue_case_hold/val-*
- config_name: lex_glue_ledgar
data_files:
- split: train
path: lex_glue_ledgar/train-*
- split: val
path: lex_glue_ledgar/val-*
- config_name: medical_abstracts
data_files:
- split: train
path: medical_abstracts/train-*
- split: val
path: medical_abstracts/val-*
- config_name: mfrc
data_files:
- split: train
path: mfrc/train-*
- split: val
path: mfrc/val-*
- config_name: mmlu
data_files:
- split: train
path: mmlu/train-*
- split: val
path: mmlu/val-*
- config_name: mmlu_pro
data_files:
- split: ood
path: mmlu_pro/ood-*
- config_name: mt_bench_human_judgments
data_files:
- split: ood
path: mt_bench_human_judgments/ood-*
- config_name: musr_murder_mysteries
data_files:
- split: ood
path: musr_murder_mysteries/ood-*
- config_name: musr_object_placements
data_files:
- split: ood
path: musr_object_placements/ood-*
- config_name: musr_team_allocation
data_files:
- split: ood
path: musr_team_allocation/ood-*
- config_name: or_bench_80k
data_files:
- split: train
path: or_bench_80k/train-*
- split: val
path: or_bench_80k/val-*
- config_name: or_bench_hard_1k
data_files:
- split: train
path: or_bench_hard_1k/train-*
- split: val
path: or_bench_hard_1k/val-*
- config_name: or_bench_toxic
data_files:
- split: ood
path: or_bench_toxic/ood-*
- config_name: projudgebench
data_files:
- split: train
path: projudgebench/train-*
- split: val
path: projudgebench/val-*
- config_name: reward_bench_2
data_files:
- split: train
path: reward_bench_2/train-*
- split: val
path: reward_bench_2/val-*
- config_name: rod101_essay_scoring
data_files:
- split: ood
path: rod101_essay_scoring/ood-*
- config_name: sem_eval_2010_task_8
data_files:
- split: train
path: sem_eval_2010_task_8/train-*
- split: val
path: sem_eval_2010_task_8/val-*
- config_name: smollm_corpus
data_files:
- split: train
path: smollm_corpus/train-*
- split: val
path: smollm_corpus/val-*
- config_name: snli
data_files:
- split: train
path: snli/train-*
- split: val
path: snli/val-*
- config_name: spartqa_mchoice
data_files:
- split: train
path: spartqa_mchoice/train-*
- split: val
path: spartqa_mchoice/val-*
- config_name: toxigen_data
data_files:
- split: train
path: toxigen_data/train-*
- split: val
path: toxigen_data/val-*
- config_name: tweet_eval_emotion
data_files:
- split: train
path: tweet_eval_emotion/train-*
- split: val
path: tweet_eval_emotion/val-*
- config_name: tweet_eval_hate
data_files:
- split: train
path: tweet_eval_hate/train-*
- split: val
path: tweet_eval_hate/val-*
- config_name: tweet_eval_irony
data_files:
- split: train
path: tweet_eval_irony/train-*
- split: val
path: tweet_eval_irony/val-*
- config_name: tweet_eval_offensive
data_files:
- split: train
path: tweet_eval_offensive/train-*
- split: val
path: tweet_eval_offensive/val-*
- config_name: tweet_eval_sentiment
data_files:
- split: train
path: tweet_eval_sentiment/train-*
- split: val
path: tweet_eval_sentiment/val-*
- config_name: ultrafeedback
data_files:
- split: train
path: ultrafeedback/train-*
- split: val
path: ultrafeedback/val-*
- config_name: writingprompts_quality
data_files:
- split: train
path: writingprompts_quality/train-*
- split: val
path: writingprompts_quality/val-*
- config_name: yahoo_answers_quality
data_files:
- split: train
path: yahoo_answers_quality/train-*
- split: val
path: yahoo_answers_quality/val-*
- config_name: yelp
data_files:
- split: train
path: yelp/train-*
- split: val
path: yelp/val-*
---
# gpt-oss-20b-v2-no-gepa
- Repo: `tytodd/gpt-oss-20b-v2-no-gepa`
- Model: `openai/gpt-oss-20b`
- Config: `/tmp/v2.yaml`

| benchmark | train | val | ood | all |
| --- | --- | --- | --- | --- |
| chatbot_arena_conversations | 78.58% | 75.00% | | 78.04% |
| hh_rlhf | 42.27% | 42.20% | | 42.26% |
| ultrafeedback | 34.40% | 36.90% | | 34.63% |
| projudgebench | 81.20% | 83.75% | | 81.46% |
| reward_bench_2 | 87.33% | 91.96% | | 88.26% |
| aes2_essay_scoring | 35.67% | 36.80% | | 35.77% |
| halueval_qa | 87.29% | 86.35% | | 87.10% |
| halueval_dialogue | 67.90% | 67.05% | | 67.73% |
| or_bench_80k | 30.56% | 34.70% | | 30.76% |
| or_bench_hard_1k | 62.75% | 48.11% | | 59.82% |
| toxigen_data | 84.48% | 83.09% | | 84.34% |
| civil_comments | 84.98% | 80.20% | | 84.55% |
| boardgame_qa | 89.51% | 90.90% | | 89.74% |
| go_emotions | 17.46% | 18.40% | | 17.55% |
| mfrc | 1.25% | 2.20% | | 1.34% |
| tweet_eval_emotion | 79.37% | 79.95% | | 79.43% |
| yelp | 60.23% | 58.80% | | 60.16% |
| tweet_eval_sentiment | 68.46% | 67.90% | | 68.41% |
| mmlu | 86.63% | 91.60% | | 86.87% |
| spartqa_mchoice | 77.32% | 73.50% | | 76.97% |
| anli_r1 | 86.80% | 83.80% | | 86.53% |
| anli_r2 | 89.85% | 77.80% | | 88.75% |
| anli_r3 | 70.71% | 66.60% | | 70.52% |
| snli | 76.54% | 78.70% | | 76.74% |
| sem_eval_2010_task_8 | 51.71% | 52.19% | | 51.83% |
| smollm_corpus | 53.52% | 53.70% | | 53.53% |
| medical_abstracts | 64.57% | 66.20% | | 64.72% |
| lex_glue_case_hold | 61.63% | 63.80% | | 61.83% |
| lex_glue_ledgar | 64.69% | 66.20% | | 64.83% |
| dbpedia_easy | 88.73% | 88.67% | | 88.70% |
| dbpedia_medium | 63.68% | 63.09% | | 63.39% |
| dbpedia_hard | 54.14% | 53.32% | | 53.73% |
| colbert_humor_detection | 84.44% | 84.20% | | 84.42% |
| tweet_eval_irony | 68.41% | 69.21% | | 68.61% |
| tweet_eval_hate | 73.23% | 68.37% | | 72.75% |
| tweet_eval_offensive | 72.38% | 73.00% | | 72.44% |
| customer_support_tickets_en | 21.94% | 22.40% | | 22.01% |
| customer_support_tickets_gorkem | 1.36% | 1.40% | | 1.36% |
| yahoo_answers_quality | 47.71% | 46.30% | | 47.54% |
| big_patent_innovation | 25.57% | 25.30% | | 25.53% |
| writingprompts_quality | 31.31% | 32.80% | | 31.50% |
| enron_email_type | 81.79% | 83.40% | | 81.99% |
| enron_email_quality | 25.43% | 25.90% | | 25.49% |
| enron_reply_quality | 62.30% | 59.10% | | 61.90% |
| argument_quality_ranking | | | 10.90% | 10.90% |
| rod101_essay_scoring | | | 30.86% | 30.86% |
| or_bench_toxic | | | 63.17% | 63.17% |
| judge_bench | | | 80.36% | 80.36% |
| musr_team_allocation | | | 72.40% | 72.40% |
| musr_object_placements | | | 53.12% | 53.12% |
| musr_murder_mysteries | | | 72.00% | 72.00% |
| halueval_summarization | | | 67.50% | 67.50% |
| code_judge_bench | | | 85.47% | 85.47% |
| mmlu_pro | | | 74.43% | 74.43% |
| gpqa_diamond | | | 65.15% | 65.15% |
| arc_challenge | | | 94.11% | 94.11% |
| mt_bench_human_judgments | | | 73.50% | 73.50% |
| all | 60.07% | 63.63% | 66.93% | 60.92% |
提供机构:
tytodd



