tytodd/qwen3.5-2b-v2-instructions-smoke-10k
收藏Hugging Face2026-04-21 更新2026-04-26 收录
下载链接:
https://hf-mirror.com/datasets/tytodd/qwen3.5-2b-v2-instructions-smoke-10k
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: aes2_essay_scoring
features:
- name: input
struct:
- name: full_text
dtype: string
- name: prediction
struct:
- name: score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 7674595
num_examples: 100
- name: val
num_bytes: 8997845
num_examples: 100
download_size: 13953734
dataset_size: 16672440
- config_name: anli_r1
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2689102
num_examples: 100
- name: val
num_bytes: 2993379
num_examples: 100
download_size: 5631317
dataset_size: 5682481
- config_name: anli_r2
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2842760
num_examples: 100
- name: val
num_bytes: 4309746
num_examples: 100
download_size: 6508605
dataset_size: 7152506
- config_name: anli_r3
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3549414
num_examples: 100
- name: val
num_bytes: 3348075
num_examples: 100
download_size: 6804929
dataset_size: 6897489
- config_name: arc_challenge
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 1874037
num_examples: 100
download_size: 1819875
dataset_size: 1874037
- config_name: argument_quality_ranking
features:
- name: input
struct:
- name: argument
dtype: string
- name: topic
dtype: string
- name: prediction
struct:
- name: quality_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 2382442
num_examples: 100
download_size: 2341251
dataset_size: 2382442
- config_name: big_patent_innovation
features:
- name: input
struct:
- name: description
dtype: string
- name: prediction
struct:
- name: innovation_score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3183511
num_examples: 100
- name: val
num_bytes: 6164448
num_examples: 100
download_size: 8346234
dataset_size: 9347959
- config_name: boardgame_qa
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 4512282
num_examples: 100
- name: val
num_bytes: 4869644
num_examples: 100
download_size: 9164348
dataset_size: 9381926
- config_name: chatbot_arena_conversations
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 4690027
num_examples: 100
- name: val
num_bytes: 5781284
num_examples: 100
download_size: 8464087
dataset_size: 10471311
- config_name: civil_comments
features:
- name: input
struct:
- name: comment
dtype: string
- name: prediction
struct:
- name: toxicity_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2234241
num_examples: 100
- name: val
num_bytes: 2429092
num_examples: 100
download_size: 4645727
dataset_size: 4663333
- config_name: code_judge_bench
features:
- name: input
struct:
- name: code_A
dtype: string
- name: code_B
dtype: string
- name: problem
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 18976776
num_examples: 100
download_size: 15035853
dataset_size: 18976776
- config_name: colbert_humor_detection
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: humor_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2081242
num_examples: 100
- name: val
num_bytes: 2119166
num_examples: 100
download_size: 4180008
dataset_size: 4200408
- config_name: customer_support_tickets_en
features:
- name: input
struct:
- name: body
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: queue
dtype: string
- name: type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3827342
num_examples: 100
- name: val
num_bytes: 3777509
num_examples: 100
download_size: 7574558
dataset_size: 7604851
- config_name: customer_support_tickets_gorkem
features:
- name: input
struct:
- name: ticket_text
dtype: string
- name: prediction
struct:
- name: ticket_subject
dtype: string
- name: ticket_type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 5750200
num_examples: 100
- name: val
num_bytes: 3933406
num_examples: 100
download_size: 9592300
dataset_size: 9683606
- config_name: dbpedia_easy
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: l1_class
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2509225
num_examples: 100
- name: val
num_bytes: 2493828
num_examples: 100
download_size: 4967980
dataset_size: 5003053
- config_name: dbpedia_hard
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: l1_class
dtype: string
- name: l2_class
dtype: string
- name: l3_class
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 5268953
num_examples: 100
- name: val
num_bytes: 3814853
num_examples: 100
download_size: 8658082
dataset_size: 9083806
- config_name: dbpedia_medium
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: l1_class
dtype: string
- name: l2_class
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 4426902
num_examples: 100
- name: val
num_bytes: 3432780
num_examples: 100
download_size: 7829364
dataset_size: 7859682
- config_name: enron_email_quality
features:
- name: input
struct:
- name: body
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: quality_score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 7792455
num_examples: 100
- name: val
num_bytes: 4166537
num_examples: 100
download_size: 7487245
dataset_size: 11958992
- config_name: enron_email_type
features:
- name: input
struct:
- name: body
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: email_type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 7011915
num_examples: 100
- name: val
num_bytes: 9624771
num_examples: 100
download_size: 12007693
dataset_size: 16636686
- config_name: enron_reply_quality
features:
- name: input
struct:
- name: original_email
dtype: string
- name: reply
dtype: string
- name: prediction
struct:
- name: quality
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 11660143
num_examples: 100
- name: val
num_bytes: 10347401
num_examples: 100
download_size: 14251754
dataset_size: 22007544
- config_name: go_emotions
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: labels
list: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 407136
num_examples: 100
- name: val
num_bytes: 388148
num_examples: 100
download_size: 811990
dataset_size: 795284
- config_name: gpqa_diamond
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 6418255
num_examples: 100
download_size: 5484248
dataset_size: 6418255
- config_name: halueval_dialogue
features:
- name: input
struct:
- name: dialogue_history
dtype: string
- name: knowledge
dtype: string
- name: response
dtype: string
- name: prediction
struct:
- name: hallucination
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3241485
num_examples: 100
- name: val
num_bytes: 4594332
num_examples: 100
download_size: 7162286
dataset_size: 7835817
- config_name: halueval_qa
features:
- name: input
struct:
- name: answer
dtype: string
- name: knowledge
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: hallucination
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3796661
num_examples: 100
- name: val
num_bytes: 4031643
num_examples: 100
download_size: 7172528
dataset_size: 7828304
- config_name: halueval_summarization
features:
- name: input
struct:
- name: document
dtype: string
- name: summary
dtype: string
- name: prediction
struct:
- name: hallucination
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 4560376
num_examples: 100
download_size: 4072244
dataset_size: 4560376
- config_name: hh_rlhf
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 4507099
num_examples: 100
- name: val
num_bytes: 5025776
num_examples: 100
download_size: 8106272
dataset_size: 9532875
- config_name: judge_bench
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 6019688
num_examples: 100
download_size: 5441888
dataset_size: 6019688
- config_name: lex_glue_case_hold
features:
- name: input
struct:
- name: context
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: option_e
dtype: string
- name: prediction
struct:
- name: selected_option
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 4515218
num_examples: 100
- name: val
num_bytes: 5323856
num_examples: 100
download_size: 8779070
dataset_size: 9839074
- config_name: lex_glue_ledgar
features:
- name: input
struct:
- name: provision_text
dtype: string
- name: prediction
struct:
- name: provision_type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3486578
num_examples: 100
- name: val
num_bytes: 3382567
num_examples: 100
download_size: 6839691
dataset_size: 6869145
- config_name: medical_abstracts
features:
- name: input
struct:
- name: medical_abstract
dtype: string
- name: prediction
struct:
- name: condition_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3414493
num_examples: 100
- name: val
num_bytes: 3400689
num_examples: 100
download_size: 6786876
dataset_size: 6815182
- config_name: mfrc
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: annotation
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2710715
num_examples: 100
- name: val
num_bytes: 2562696
num_examples: 100
download_size: 5224960
dataset_size: 5273411
- config_name: mmlu
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 4322825
num_examples: 100
- name: val
num_bytes: 4010137
num_examples: 100
download_size: 8239519
dataset_size: 8332962
- config_name: mmlu_pro
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 3561156
num_examples: 100
download_size: 3500742
dataset_size: 3561156
- config_name: mt_bench_human_judgments
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 3491982
num_examples: 100
download_size: 3426522
dataset_size: 3491982
- config_name: musr_murder_mysteries
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 12043261
num_examples: 100
download_size: 8312806
dataset_size: 12043261
- config_name: musr_object_placements
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 4069139
num_examples: 100
download_size: 4010437
dataset_size: 4069139
- config_name: musr_team_allocation
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 4698879
num_examples: 100
download_size: 4603591
dataset_size: 4698879
- config_name: or_bench_80k
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3556582
num_examples: 100
- name: val
num_bytes: 4377296
num_examples: 100
download_size: 7875773
dataset_size: 7933878
- config_name: or_bench_hard_1k
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3593121
num_examples: 100
- name: val
num_bytes: 6230251
num_examples: 100
download_size: 8360776
dataset_size: 9823372
- config_name: or_bench_toxic
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 2105975
num_examples: 100
download_size: 2056204
dataset_size: 2105975
- config_name: projudgebench
features:
- name: input
struct:
- name: correct_answer
dtype: string
- name: question
dtype: string
- name: step_to_evaluate
dtype: string
- name: steps
list: string
- name: prediction
struct:
- name: correct
dtype: bool
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 5192258
num_examples: 100
- name: val
num_bytes: 5551965
num_examples: 100
download_size: 10706018
dataset_size: 10744223
- config_name: reward_bench_2
features:
- name: input
struct:
- name: prompt
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 9386751
num_examples: 100
- name: val
num_bytes: 4982365
num_examples: 100
download_size: 11856339
dataset_size: 14369116
- config_name: rod101_essay_scoring
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 9716460
num_examples: 100
download_size: 6879387
dataset_size: 9716460
- config_name: sem_eval_2010_task_8
features:
- name: input
struct:
- name: sentence
dtype: string
- name: prediction
struct:
- name: relation_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 4012558
num_examples: 100
- name: val
num_bytes: 5463046
num_examples: 100
download_size: 8898067
dataset_size: 9475604
- config_name: smollm_corpus
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: audience
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 5833121
num_examples: 100
- name: val
num_bytes: 3806181
num_examples: 100
download_size: 8671905
dataset_size: 9639302
- config_name: snli
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2362523
num_examples: 100
- name: val
num_bytes: 2571094
num_examples: 100
download_size: 4877265
dataset_size: 4933617
- config_name: spartqa_mchoice
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 6385672
num_examples: 100
- name: val
num_bytes: 6555973
num_examples: 100
download_size: 12665503
dataset_size: 12941645
- config_name: toxigen_data
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: toxicity_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2100204
num_examples: 100
- name: val
num_bytes: 2567171
num_examples: 100
download_size: 4488084
dataset_size: 4667375
- config_name: tweet_eval_emotion
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: emotion_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2461021
num_examples: 100
- name: val
num_bytes: 2526331
num_examples: 100
download_size: 4976727
dataset_size: 4987352
- config_name: tweet_eval_hate
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: hate_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2570214
num_examples: 100
- name: val
num_bytes: 2623364
num_examples: 100
download_size: 5158417
dataset_size: 5193578
- config_name: tweet_eval_irony
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: irony_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2587756
num_examples: 100
- name: val
num_bytes: 3898383
num_examples: 100
download_size: 5691652
dataset_size: 6486139
- config_name: tweet_eval_offensive
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: offensive_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2278350
num_examples: 100
- name: val
num_bytes: 5247161
num_examples: 100
download_size: 6583864
dataset_size: 7525511
- config_name: tweet_eval_sentiment
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: sentiment_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 2539094
num_examples: 100
- name: val
num_bytes: 2608997
num_examples: 100
download_size: 5124686
dataset_size: 5148091
- config_name: ultrafeedback
features:
- name: input
struct:
- name: prompt
dtype: string
- name: response
dtype: string
- name: prediction
struct:
- name: instruction_following
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3961343
num_examples: 100
- name: val
num_bytes: 6557603
num_examples: 100
download_size: 9559601
dataset_size: 10518946
- config_name: writingprompts_quality
features:
- name: input
struct:
- name: prompt
dtype: string
- name: story
dtype: string
- name: prediction
struct:
- name: quality_score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 9455055
num_examples: 100
- name: val
num_bytes: 5435728
num_examples: 100
download_size: 11508374
dataset_size: 14890783
- config_name: yahoo_answers_quality
features:
- name: input
struct:
- name: answer
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: rating
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 7939646
num_examples: 100
- name: val
num_bytes: 3147017
num_examples: 100
download_size: 8760811
dataset_size: 11086663
- config_name: yelp
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: rating
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 3418240
num_examples: 100
- name: val
num_bytes: 4603872
num_examples: 100
download_size: 7189598
dataset_size: 8022112
configs:
- config_name: aes2_essay_scoring
data_files:
- split: train
path: aes2_essay_scoring/train-*
- split: val
path: aes2_essay_scoring/val-*
- config_name: anli_r1
data_files:
- split: train
path: anli_r1/train-*
- split: val
path: anli_r1/val-*
- config_name: anli_r2
data_files:
- split: train
path: anli_r2/train-*
- split: val
path: anli_r2/val-*
- config_name: anli_r3
data_files:
- split: train
path: anli_r3/train-*
- split: val
path: anli_r3/val-*
- config_name: arc_challenge
data_files:
- split: ood
path: arc_challenge/ood-*
- config_name: argument_quality_ranking
data_files:
- split: ood
path: argument_quality_ranking/ood-*
- config_name: big_patent_innovation
data_files:
- split: train
path: big_patent_innovation/train-*
- split: val
path: big_patent_innovation/val-*
- config_name: boardgame_qa
data_files:
- split: train
path: boardgame_qa/train-*
- split: val
path: boardgame_qa/val-*
- config_name: chatbot_arena_conversations
data_files:
- split: train
path: chatbot_arena_conversations/train-*
- split: val
path: chatbot_arena_conversations/val-*
- config_name: civil_comments
data_files:
- split: train
path: civil_comments/train-*
- split: val
path: civil_comments/val-*
- config_name: code_judge_bench
data_files:
- split: ood
path: code_judge_bench/ood-*
- config_name: colbert_humor_detection
data_files:
- split: train
path: colbert_humor_detection/train-*
- split: val
path: colbert_humor_detection/val-*
- config_name: customer_support_tickets_en
data_files:
- split: train
path: customer_support_tickets_en/train-*
- split: val
path: customer_support_tickets_en/val-*
- config_name: customer_support_tickets_gorkem
data_files:
- split: train
path: customer_support_tickets_gorkem/train-*
- split: val
path: customer_support_tickets_gorkem/val-*
- config_name: dbpedia_easy
data_files:
- split: train
path: dbpedia_easy/train-*
- split: val
path: dbpedia_easy/val-*
- config_name: dbpedia_hard
data_files:
- split: train
path: dbpedia_hard/train-*
- split: val
path: dbpedia_hard/val-*
- config_name: dbpedia_medium
data_files:
- split: train
path: dbpedia_medium/train-*
- split: val
path: dbpedia_medium/val-*
- config_name: enron_email_quality
data_files:
- split: train
path: enron_email_quality/train-*
- split: val
path: enron_email_quality/val-*
- config_name: enron_email_type
data_files:
- split: train
path: enron_email_type/train-*
- split: val
path: enron_email_type/val-*
- config_name: enron_reply_quality
data_files:
- split: train
path: enron_reply_quality/train-*
- split: val
path: enron_reply_quality/val-*
- config_name: go_emotions
data_files:
- split: train
path: go_emotions/train-*
- split: val
path: go_emotions/val-*
- config_name: gpqa_diamond
data_files:
- split: ood
path: gpqa_diamond/ood-*
- config_name: halueval_dialogue
data_files:
- split: train
path: halueval_dialogue/train-*
- split: val
path: halueval_dialogue/val-*
- config_name: halueval_qa
data_files:
- split: train
path: halueval_qa/train-*
- split: val
path: halueval_qa/val-*
- config_name: halueval_summarization
data_files:
- split: ood
path: halueval_summarization/ood-*
- config_name: hh_rlhf
data_files:
- split: train
path: hh_rlhf/train-*
- split: val
path: hh_rlhf/val-*
- config_name: judge_bench
data_files:
- split: ood
path: judge_bench/ood-*
- config_name: lex_glue_case_hold
data_files:
- split: train
path: lex_glue_case_hold/train-*
- split: val
path: lex_glue_case_hold/val-*
- config_name: lex_glue_ledgar
data_files:
- split: train
path: lex_glue_ledgar/train-*
- split: val
path: lex_glue_ledgar/val-*
- config_name: medical_abstracts
data_files:
- split: train
path: medical_abstracts/train-*
- split: val
path: medical_abstracts/val-*
- config_name: mfrc
data_files:
- split: train
path: mfrc/train-*
- split: val
path: mfrc/val-*
- config_name: mmlu
data_files:
- split: train
path: mmlu/train-*
- split: val
path: mmlu/val-*
- config_name: mmlu_pro
data_files:
- split: ood
path: mmlu_pro/ood-*
- config_name: mt_bench_human_judgments
data_files:
- split: ood
path: mt_bench_human_judgments/ood-*
- config_name: musr_murder_mysteries
data_files:
- split: ood
path: musr_murder_mysteries/ood-*
- config_name: musr_object_placements
data_files:
- split: ood
path: musr_object_placements/ood-*
- config_name: musr_team_allocation
data_files:
- split: ood
path: musr_team_allocation/ood-*
- config_name: or_bench_80k
data_files:
- split: train
path: or_bench_80k/train-*
- split: val
path: or_bench_80k/val-*
- config_name: or_bench_hard_1k
data_files:
- split: train
path: or_bench_hard_1k/train-*
- split: val
path: or_bench_hard_1k/val-*
- config_name: or_bench_toxic
data_files:
- split: ood
path: or_bench_toxic/ood-*
- config_name: projudgebench
data_files:
- split: train
path: projudgebench/train-*
- split: val
path: projudgebench/val-*
- config_name: reward_bench_2
data_files:
- split: train
path: reward_bench_2/train-*
- split: val
path: reward_bench_2/val-*
- config_name: rod101_essay_scoring
data_files:
- split: ood
path: rod101_essay_scoring/ood-*
- config_name: sem_eval_2010_task_8
data_files:
- split: train
path: sem_eval_2010_task_8/train-*
- split: val
path: sem_eval_2010_task_8/val-*
- config_name: smollm_corpus
data_files:
- split: train
path: smollm_corpus/train-*
- split: val
path: smollm_corpus/val-*
- config_name: snli
data_files:
- split: train
path: snli/train-*
- split: val
path: snli/val-*
- config_name: spartqa_mchoice
data_files:
- split: train
path: spartqa_mchoice/train-*
- split: val
path: spartqa_mchoice/val-*
- config_name: toxigen_data
data_files:
- split: train
path: toxigen_data/train-*
- split: val
path: toxigen_data/val-*
- config_name: tweet_eval_emotion
data_files:
- split: train
path: tweet_eval_emotion/train-*
- split: val
path: tweet_eval_emotion/val-*
- config_name: tweet_eval_hate
data_files:
- split: train
path: tweet_eval_hate/train-*
- split: val
path: tweet_eval_hate/val-*
- config_name: tweet_eval_irony
data_files:
- split: train
path: tweet_eval_irony/train-*
- split: val
path: tweet_eval_irony/val-*
- config_name: tweet_eval_offensive
data_files:
- split: train
path: tweet_eval_offensive/train-*
- split: val
path: tweet_eval_offensive/val-*
- config_name: tweet_eval_sentiment
data_files:
- split: train
path: tweet_eval_sentiment/train-*
- split: val
path: tweet_eval_sentiment/val-*
- config_name: ultrafeedback
data_files:
- split: train
path: ultrafeedback/train-*
- split: val
path: ultrafeedback/val-*
- config_name: writingprompts_quality
data_files:
- split: train
path: writingprompts_quality/train-*
- split: val
path: writingprompts_quality/val-*
- config_name: yahoo_answers_quality
data_files:
- split: train
path: yahoo_answers_quality/train-*
- split: val
path: yahoo_answers_quality/val-*
- config_name: yelp
data_files:
- split: train
path: yelp/train-*
- split: val
path: yelp/val-*
---
# qwen3.5-2b-v2-instructions-smoke-10k
- Repo: `tytodd/qwen3.5-2b-v2-instructions-smoke-10k`
- Model: `Qwen/Qwen3.5-2B`
- Config: `/tmp/v2-instructions-smoke-10k.yaml`
| benchmark | train | val | ood | all |
| --- | --- | --- | --- | --- |
| chatbot_arena_conversations | 66.00% | 71.00% | | 68.50% |
| hh_rlhf | 47.00% | 57.00% | | 52.00% |
| ultrafeedback | 40.00% | 31.00% | | 35.50% |
| projudgebench | 88.00% | 83.00% | | 85.50% |
| reward_bench_2 | 67.00% | 72.00% | | 69.50% |
| aes2_essay_scoring | 23.00% | 25.00% | | 24.00% |
| halueval_qa | 85.00% | 79.00% | | 82.00% |
| halueval_dialogue | 65.00% | 67.00% | | 66.00% |
| or_bench_80k | 20.00% | 31.00% | | 25.50% |
| or_bench_hard_1k | 56.00% | 55.00% | | 55.50% |
| toxigen_data | 76.00% | 70.00% | | 73.00% |
| civil_comments | 92.00% | 84.00% | | 88.00% |
| boardgame_qa | 89.00% | 90.00% | | 89.50% |
| go_emotions | 1.00% | 0.00% | | 0.50% |
| mfrc | 49.00% | 62.00% | | 55.50% |
| tweet_eval_emotion | 79.00% | 81.00% | | 80.00% |
| yelp | 63.00% | 44.00% | | 53.50% |
| tweet_eval_sentiment | 67.00% | 71.00% | | 69.00% |
| mmlu | 42.00% | 71.00% | | 56.50% |
| spartqa_mchoice | 82.00% | 71.00% | | 76.50% |
| anli_r1 | 85.00% | 77.00% | | 81.00% |
| anli_r2 | 91.00% | 69.00% | | 80.00% |
| anli_r3 | 53.00% | 65.00% | | 59.00% |
| snli | 82.00% | 73.00% | | 77.50% |
| sem_eval_2010_task_8 | 63.00% | 55.00% | | 59.00% |
| smollm_corpus | 52.00% | 42.00% | | 47.00% |
| medical_abstracts | 56.00% | 70.00% | | 63.00% |
| lex_glue_case_hold | 76.00% | 66.00% | | 71.00% |
| lex_glue_ledgar | 74.00% | 65.00% | | 69.50% |
| dbpedia_easy | 94.00% | 93.00% | | 93.50% |
| dbpedia_medium | 51.00% | 51.00% | | 51.00% |
| dbpedia_hard | 41.00% | 40.00% | | 40.50% |
| colbert_humor_detection | 72.00% | 72.00% | | 72.00% |
| tweet_eval_irony | 55.00% | 57.00% | | 56.00% |
| tweet_eval_hate | 76.00% | 63.00% | | 69.50% |
| tweet_eval_offensive | 77.00% | 73.00% | | 75.00% |
| customer_support_tickets_en | 51.00% | 31.00% | | 41.00% |
| customer_support_tickets_gorkem | 2.00% | 1.00% | | 1.50% |
| yahoo_answers_quality | 22.00% | 26.00% | | 24.00% |
| big_patent_innovation | 53.00% | 56.00% | | 54.50% |
| writingprompts_quality | 33.00% | 47.00% | | 40.00% |
| enron_email_type | 72.00% | 68.00% | | 70.00% |
| enron_email_quality | 17.00% | 11.00% | | 14.00% |
| enron_reply_quality | 63.00% | 58.00% | | 60.50% |
| argument_quality_ranking | | | 31.00% | 31.00% |
| rod101_essay_scoring | | | 23.00% | 23.00% |
| or_bench_toxic | | | 26.00% | 26.00% |
| judge_bench | | | 58.00% | 58.00% |
| musr_team_allocation | | | 69.00% | 69.00% |
| musr_object_placements | | | 51.00% | 51.00% |
| musr_murder_mysteries | | | 53.00% | 53.00% |
| halueval_summarization | | | 66.00% | 66.00% |
| code_judge_bench | | | 60.00% | 60.00% |
| mmlu_pro | | | 64.00% | 64.00% |
| gpqa_diamond | | | 53.00% | 53.00% |
| arc_challenge | | | 88.00% | 88.00% |
| mt_bench_human_judgments | | | 67.00% | 67.00% |
| all | 59.27% | 57.82% | 54.54% | 58.03% |
提供机构:
tytodd



