tytodd/qwen3.5-2b-v2-instructions-smoke
收藏Hugging Face2026-04-20 更新2026-04-26 收录
下载链接:
https://hf-mirror.com/datasets/tytodd/qwen3.5-2b-v2-instructions-smoke
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: aes2_essay_scoring
features:
- name: input
struct:
- name: full_text
dtype: string
- name: prediction
struct:
- name: score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 380771
num_examples: 10
- name: val
num_bytes: 415898
num_examples: 10
download_size: 837244
dataset_size: 796669
- config_name: anli_r1
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 293138
num_examples: 10
- name: val
num_bytes: 288384
num_examples: 10
download_size: 583956
dataset_size: 581522
- config_name: anli_r2
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 278357
num_examples: 10
- name: val
num_bytes: 371844
num_examples: 10
download_size: 666938
dataset_size: 650201
- config_name: anli_r3
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 374051
num_examples: 10
- name: val
num_bytes: 336724
num_examples: 10
download_size: 736748
dataset_size: 710775
- config_name: arc_challenge
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 184998
num_examples: 10
download_size: 200146
dataset_size: 184998
- config_name: argument_quality_ranking
features:
- name: input
struct:
- name: argument
dtype: string
- name: topic
dtype: string
- name: prediction
struct:
- name: quality_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 208072
num_examples: 10
download_size: 219810
dataset_size: 208072
- config_name: big_patent_innovation
features:
- name: input
struct:
- name: description
dtype: string
- name: prediction
struct:
- name: innovation_score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 351407
num_examples: 10
- name: val
num_bytes: 397242
num_examples: 10
download_size: 819171
dataset_size: 748649
- config_name: boardgame_qa
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 418585
num_examples: 10
- name: val
num_bytes: 424251
num_examples: 10
download_size: 879936
dataset_size: 842836
- config_name: chatbot_arena_conversations
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 317937
num_examples: 10
- name: val
num_bytes: 253924
num_examples: 10
download_size: 615801
dataset_size: 571861
- config_name: civil_comments
features:
- name: input
struct:
- name: comment
dtype: string
- name: prediction
struct:
- name: toxicity_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 174880
num_examples: 10
- name: val
num_bytes: 193396
num_examples: 10
download_size: 400987
dataset_size: 368276
- config_name: code_judge_bench
features:
- name: input
struct:
- name: code_A
dtype: string
- name: code_B
dtype: string
- name: problem
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 876103
num_examples: 10
download_size: 897638
dataset_size: 876103
- config_name: colbert_humor_detection
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: humor_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 214397
num_examples: 10
- name: val
num_bytes: 222232
num_examples: 10
download_size: 471861
dataset_size: 436629
- config_name: customer_support_tickets_en
features:
- name: input
struct:
- name: body
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: queue
dtype: string
- name: type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 342400
num_examples: 10
- name: val
num_bytes: 352874
num_examples: 10
download_size: 720266
dataset_size: 695274
- config_name: customer_support_tickets_gorkem
features:
- name: input
struct:
- name: ticket_text
dtype: string
- name: prediction
struct:
- name: ticket_subject
dtype: string
- name: ticket_type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 444475
num_examples: 10
- name: val
num_bytes: 409339
num_examples: 10
download_size: 878123
dataset_size: 853814
- config_name: dbpedia_easy
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: l1_class
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 242047
num_examples: 10
- name: val
num_bytes: 233794
num_examples: 10
download_size: 504333
dataset_size: 475841
- config_name: dbpedia_hard
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: l1_class
dtype: string
- name: l2_class
dtype: string
- name: l3_class
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 1639042
num_examples: 10
- name: val
num_bytes: 314634
num_examples: 10
download_size: 1366368
dataset_size: 1953676
- config_name: dbpedia_medium
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: l1_class
dtype: string
- name: l2_class
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 345938
num_examples: 10
- name: val
num_bytes: 414709
num_examples: 10
download_size: 789249
dataset_size: 760647
- config_name: enron_email_quality
features:
- name: input
struct:
- name: body
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: quality_score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 278376
num_examples: 10
- name: val
num_bytes: 301353
num_examples: 10
download_size: 448804
dataset_size: 579729
- config_name: enron_email_type
features:
- name: input
struct:
- name: body
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: email_type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 597016
num_examples: 10
- name: val
num_bytes: 586188
num_examples: 10
download_size: 925908
dataset_size: 1183204
- config_name: enron_reply_quality
features:
- name: input
struct:
- name: original_email
dtype: string
- name: reply
dtype: string
- name: prediction
struct:
- name: quality
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 658527
num_examples: 10
- name: val
num_bytes: 871623
num_examples: 10
download_size: 1128502
dataset_size: 1530150
- config_name: go_emotions
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: labels
list: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 53599
num_examples: 10
- name: val
num_bytes: 22724
num_examples: 10
download_size: 109830
dataset_size: 76323
- config_name: gpqa_diamond
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 433691
num_examples: 10
download_size: 456650
dataset_size: 433691
- config_name: halueval_dialogue
features:
- name: input
struct:
- name: dialogue_history
dtype: string
- name: knowledge
dtype: string
- name: response
dtype: string
- name: prediction
struct:
- name: hallucination
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 272340
num_examples: 10
- name: val
num_bytes: 338256
num_examples: 10
download_size: 632779
dataset_size: 610596
- config_name: halueval_qa
features:
- name: input
struct:
- name: answer
dtype: string
- name: knowledge
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: hallucination
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 256153
num_examples: 10
- name: val
num_bytes: 298560
num_examples: 10
download_size: 579272
dataset_size: 554713
- config_name: halueval_summarization
features:
- name: input
struct:
- name: document
dtype: string
- name: summary
dtype: string
- name: prediction
struct:
- name: hallucination
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 373607
num_examples: 10
download_size: 401220
dataset_size: 373607
- config_name: hh_rlhf
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 374104
num_examples: 10
- name: val
num_bytes: 376941
num_examples: 10
download_size: 791966
dataset_size: 751045
- config_name: judge_bench
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 533024
num_examples: 10
download_size: 574748
dataset_size: 533024
- config_name: lex_glue_case_hold
features:
- name: input
struct:
- name: context
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: option_e
dtype: string
- name: prediction
struct:
- name: selected_option
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 433496
num_examples: 10
- name: val
num_bytes: 376416
num_examples: 10
download_size: 856447
dataset_size: 809912
- config_name: lex_glue_ledgar
features:
- name: input
struct:
- name: provision_text
dtype: string
- name: prediction
struct:
- name: provision_type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 299812
num_examples: 10
- name: val
num_bytes: 281843
num_examples: 10
download_size: 607069
dataset_size: 581655
- config_name: medical_abstracts
features:
- name: input
struct:
- name: medical_abstract
dtype: string
- name: prediction
struct:
- name: condition_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 326311
num_examples: 10
- name: val
num_bytes: 366021
num_examples: 10
download_size: 728243
dataset_size: 692332
- config_name: mfrc
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: annotation
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 270831
num_examples: 10
- name: val
num_bytes: 294338
num_examples: 10
download_size: 580106
dataset_size: 565169
- config_name: mmlu
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 397470
num_examples: 10
- name: val
num_bytes: 362216
num_examples: 10
download_size: 787134
dataset_size: 759686
- config_name: mmlu_pro
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 315832
num_examples: 10
download_size: 325955
dataset_size: 315832
- config_name: mt_bench_human_judgments
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 326775
num_examples: 10
download_size: 352517
dataset_size: 326775
- config_name: musr_murder_mysteries
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 2368992
num_examples: 10
download_size: 1438301
dataset_size: 2368992
- config_name: musr_object_placements
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 381974
num_examples: 10
download_size: 391291
dataset_size: 381974
- config_name: musr_team_allocation
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 387005
num_examples: 10
download_size: 413049
dataset_size: 387005
- config_name: or_bench_80k
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 365568
num_examples: 10
- name: val
num_bytes: 309507
num_examples: 10
download_size: 693889
dataset_size: 675075
- config_name: or_bench_hard_1k
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 293048
num_examples: 10
- name: val
num_bytes: 389036
num_examples: 10
download_size: 709008
dataset_size: 682084
- config_name: or_bench_toxic
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 215321
num_examples: 10
download_size: 226247
dataset_size: 215321
- config_name: projudgebench
features:
- name: input
struct:
- name: correct_answer
dtype: string
- name: question
dtype: string
- name: step_to_evaluate
dtype: string
- name: steps
list: string
- name: prediction
struct:
- name: correct
dtype: bool
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 393064
num_examples: 10
- name: val
num_bytes: 414637
num_examples: 10
download_size: 845849
dataset_size: 807701
- config_name: reward_bench_2
features:
- name: input
struct:
- name: prompt
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 490275
num_examples: 10
- name: val
num_bytes: 451040
num_examples: 10
download_size: 1019602
dataset_size: 941315
- config_name: rod101_essay_scoring
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 378246
num_examples: 10
download_size: 403171
dataset_size: 378246
- config_name: sem_eval_2010_task_8
features:
- name: input
struct:
- name: sentence
dtype: string
- name: prediction
struct:
- name: relation_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 411139
num_examples: 10
- name: val
num_bytes: 424193
num_examples: 10
download_size: 864648
dataset_size: 835332
- config_name: smollm_corpus
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: audience
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 452908
num_examples: 10
- name: val
num_bytes: 303635
num_examples: 10
download_size: 798880
dataset_size: 756543
- config_name: snli
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 267327
num_examples: 10
- name: val
num_bytes: 247028
num_examples: 10
download_size: 535781
dataset_size: 514355
- config_name: spartqa_mchoice
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 632621
num_examples: 10
- name: val
num_bytes: 692466
num_examples: 10
download_size: 1346230
dataset_size: 1325087
- config_name: toxigen_data
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: toxicity_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 204517
num_examples: 10
- name: val
num_bytes: 170384
num_examples: 10
download_size: 392217
dataset_size: 374901
- config_name: tweet_eval_emotion
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: emotion_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 245401
num_examples: 10
- name: val
num_bytes: 240618
num_examples: 10
download_size: 510493
dataset_size: 486019
- config_name: tweet_eval_hate
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: hate_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 258848
num_examples: 10
- name: val
num_bytes: 254810
num_examples: 10
download_size: 529005
dataset_size: 513658
- config_name: tweet_eval_irony
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: irony_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 238127
num_examples: 10
- name: val
num_bytes: 233031
num_examples: 10
download_size: 487587
dataset_size: 471158
- config_name: tweet_eval_offensive
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: offensive_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 258173
num_examples: 10
- name: val
num_bytes: 243102
num_examples: 10
download_size: 515336
dataset_size: 501275
- config_name: tweet_eval_sentiment
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: sentiment_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 242874
num_examples: 10
- name: val
num_bytes: 254186
num_examples: 10
download_size: 510627
dataset_size: 497060
- config_name: ultrafeedback
features:
- name: input
struct:
- name: prompt
dtype: string
- name: response
dtype: string
- name: prediction
struct:
- name: instruction_following
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 279930
num_examples: 10
- name: val
num_bytes: 475191
num_examples: 10
download_size: 797239
dataset_size: 755121
- config_name: writingprompts_quality
features:
- name: input
struct:
- name: prompt
dtype: string
- name: story
dtype: string
- name: prediction
struct:
- name: quality_score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 1875857
num_examples: 10
- name: val
num_bytes: 878749
num_examples: 10
download_size: 2045635
dataset_size: 2754606
- config_name: yahoo_answers_quality
features:
- name: input
struct:
- name: answer
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: rating
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 259963
num_examples: 10
- name: val
num_bytes: 181132
num_examples: 10
download_size: 459378
dataset_size: 441095
- config_name: yelp
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: rating
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 312138
num_examples: 10
- name: val
num_bytes: 366703
num_examples: 10
download_size: 699507
dataset_size: 678841
configs:
- config_name: aes2_essay_scoring
data_files:
- split: train
path: aes2_essay_scoring/train-*
- split: val
path: aes2_essay_scoring/val-*
- config_name: anli_r1
data_files:
- split: train
path: anli_r1/train-*
- split: val
path: anli_r1/val-*
- config_name: anli_r2
data_files:
- split: train
path: anli_r2/train-*
- split: val
path: anli_r2/val-*
- config_name: anli_r3
data_files:
- split: train
path: anli_r3/train-*
- split: val
path: anli_r3/val-*
- config_name: arc_challenge
data_files:
- split: ood
path: arc_challenge/ood-*
- config_name: argument_quality_ranking
data_files:
- split: ood
path: argument_quality_ranking/ood-*
- config_name: big_patent_innovation
data_files:
- split: train
path: big_patent_innovation/train-*
- split: val
path: big_patent_innovation/val-*
- config_name: boardgame_qa
data_files:
- split: train
path: boardgame_qa/train-*
- split: val
path: boardgame_qa/val-*
- config_name: chatbot_arena_conversations
data_files:
- split: train
path: chatbot_arena_conversations/train-*
- split: val
path: chatbot_arena_conversations/val-*
- config_name: civil_comments
data_files:
- split: train
path: civil_comments/train-*
- split: val
path: civil_comments/val-*
- config_name: code_judge_bench
data_files:
- split: ood
path: code_judge_bench/ood-*
- config_name: colbert_humor_detection
data_files:
- split: train
path: colbert_humor_detection/train-*
- split: val
path: colbert_humor_detection/val-*
- config_name: customer_support_tickets_en
data_files:
- split: train
path: customer_support_tickets_en/train-*
- split: val
path: customer_support_tickets_en/val-*
- config_name: customer_support_tickets_gorkem
data_files:
- split: train
path: customer_support_tickets_gorkem/train-*
- split: val
path: customer_support_tickets_gorkem/val-*
- config_name: dbpedia_easy
data_files:
- split: train
path: dbpedia_easy/train-*
- split: val
path: dbpedia_easy/val-*
- config_name: dbpedia_hard
data_files:
- split: train
path: dbpedia_hard/train-*
- split: val
path: dbpedia_hard/val-*
- config_name: dbpedia_medium
data_files:
- split: train
path: dbpedia_medium/train-*
- split: val
path: dbpedia_medium/val-*
- config_name: enron_email_quality
data_files:
- split: train
path: enron_email_quality/train-*
- split: val
path: enron_email_quality/val-*
- config_name: enron_email_type
data_files:
- split: train
path: enron_email_type/train-*
- split: val
path: enron_email_type/val-*
- config_name: enron_reply_quality
data_files:
- split: train
path: enron_reply_quality/train-*
- split: val
path: enron_reply_quality/val-*
- config_name: go_emotions
data_files:
- split: train
path: go_emotions/train-*
- split: val
path: go_emotions/val-*
- config_name: gpqa_diamond
data_files:
- split: ood
path: gpqa_diamond/ood-*
- config_name: halueval_dialogue
data_files:
- split: train
path: halueval_dialogue/train-*
- split: val
path: halueval_dialogue/val-*
- config_name: halueval_qa
data_files:
- split: train
path: halueval_qa/train-*
- split: val
path: halueval_qa/val-*
- config_name: halueval_summarization
data_files:
- split: ood
path: halueval_summarization/ood-*
- config_name: hh_rlhf
data_files:
- split: train
path: hh_rlhf/train-*
- split: val
path: hh_rlhf/val-*
- config_name: judge_bench
data_files:
- split: ood
path: judge_bench/ood-*
- config_name: lex_glue_case_hold
data_files:
- split: train
path: lex_glue_case_hold/train-*
- split: val
path: lex_glue_case_hold/val-*
- config_name: lex_glue_ledgar
data_files:
- split: train
path: lex_glue_ledgar/train-*
- split: val
path: lex_glue_ledgar/val-*
- config_name: medical_abstracts
data_files:
- split: train
path: medical_abstracts/train-*
- split: val
path: medical_abstracts/val-*
- config_name: mfrc
data_files:
- split: train
path: mfrc/train-*
- split: val
path: mfrc/val-*
- config_name: mmlu
data_files:
- split: train
path: mmlu/train-*
- split: val
path: mmlu/val-*
- config_name: mmlu_pro
data_files:
- split: ood
path: mmlu_pro/ood-*
- config_name: mt_bench_human_judgments
data_files:
- split: ood
path: mt_bench_human_judgments/ood-*
- config_name: musr_murder_mysteries
data_files:
- split: ood
path: musr_murder_mysteries/ood-*
- config_name: musr_object_placements
data_files:
- split: ood
path: musr_object_placements/ood-*
- config_name: musr_team_allocation
data_files:
- split: ood
path: musr_team_allocation/ood-*
- config_name: or_bench_80k
data_files:
- split: train
path: or_bench_80k/train-*
- split: val
path: or_bench_80k/val-*
- config_name: or_bench_hard_1k
data_files:
- split: train
path: or_bench_hard_1k/train-*
- split: val
path: or_bench_hard_1k/val-*
- config_name: or_bench_toxic
data_files:
- split: ood
path: or_bench_toxic/ood-*
- config_name: projudgebench
data_files:
- split: train
path: projudgebench/train-*
- split: val
path: projudgebench/val-*
- config_name: reward_bench_2
data_files:
- split: train
path: reward_bench_2/train-*
- split: val
path: reward_bench_2/val-*
- config_name: rod101_essay_scoring
data_files:
- split: ood
path: rod101_essay_scoring/ood-*
- config_name: sem_eval_2010_task_8
data_files:
- split: train
path: sem_eval_2010_task_8/train-*
- split: val
path: sem_eval_2010_task_8/val-*
- config_name: smollm_corpus
data_files:
- split: train
path: smollm_corpus/train-*
- split: val
path: smollm_corpus/val-*
- config_name: snli
data_files:
- split: train
path: snli/train-*
- split: val
path: snli/val-*
- config_name: spartqa_mchoice
data_files:
- split: train
path: spartqa_mchoice/train-*
- split: val
path: spartqa_mchoice/val-*
- config_name: toxigen_data
data_files:
- split: train
path: toxigen_data/train-*
- split: val
path: toxigen_data/val-*
- config_name: tweet_eval_emotion
data_files:
- split: train
path: tweet_eval_emotion/train-*
- split: val
path: tweet_eval_emotion/val-*
- config_name: tweet_eval_hate
data_files:
- split: train
path: tweet_eval_hate/train-*
- split: val
path: tweet_eval_hate/val-*
- config_name: tweet_eval_irony
data_files:
- split: train
path: tweet_eval_irony/train-*
- split: val
path: tweet_eval_irony/val-*
- config_name: tweet_eval_offensive
data_files:
- split: train
path: tweet_eval_offensive/train-*
- split: val
path: tweet_eval_offensive/val-*
- config_name: tweet_eval_sentiment
data_files:
- split: train
path: tweet_eval_sentiment/train-*
- split: val
path: tweet_eval_sentiment/val-*
- config_name: ultrafeedback
data_files:
- split: train
path: ultrafeedback/train-*
- split: val
path: ultrafeedback/val-*
- config_name: writingprompts_quality
data_files:
- split: train
path: writingprompts_quality/train-*
- split: val
path: writingprompts_quality/val-*
- config_name: yahoo_answers_quality
data_files:
- split: train
path: yahoo_answers_quality/train-*
- split: val
path: yahoo_answers_quality/val-*
- config_name: yelp
data_files:
- split: train
path: yelp/train-*
- split: val
path: yelp/val-*
---
# qwen3.5-2b-v2-instructions-smoke
- Repo: `tytodd/qwen3.5-2b-v2-instructions-smoke`
- Model: `Qwen/Qwen3.5-2B`
- Config: `/tmp/v2-instructions-smoke.yaml`
| benchmark | train | val | ood | all |
| --- | --- | --- | --- | --- |
| chatbot_arena_conversations | 60.00% | 30.00% | | 45.00% |
| hh_rlhf | 70.00% | 80.00% | | 75.00% |
| ultrafeedback | 20.00% | 50.00% | | 35.00% |
| projudgebench | 100.00% | 100.00% | | 100.00% |
| reward_bench_2 | 80.00% | 70.00% | | 75.00% |
| aes2_essay_scoring | 10.00% | 20.00% | | 15.00% |
| halueval_qa | 80.00% | 90.00% | | 85.00% |
| halueval_dialogue | 70.00% | 70.00% | | 70.00% |
| or_bench_80k | 10.00% | 30.00% | | 20.00% |
| or_bench_hard_1k | 90.00% | 40.00% | | 65.00% |
| toxigen_data | 70.00% | 90.00% | | 80.00% |
| civil_comments | 90.00% | 90.00% | | 90.00% |
| boardgame_qa | 100.00% | 100.00% | | 100.00% |
| go_emotions | 0.00% | 0.00% | | 0.00% |
| mfrc | 70.00% | 70.00% | | 70.00% |
| tweet_eval_emotion | 70.00% | 100.00% | | 85.00% |
| yelp | 70.00% | 60.00% | | 65.00% |
| tweet_eval_sentiment | 70.00% | 60.00% | | 65.00% |
| mmlu | 40.00% | 90.00% | | 65.00% |
| spartqa_mchoice | 100.00% | 60.00% | | 80.00% |
| anli_r1 | 90.00% | 60.00% | | 75.00% |
| anli_r2 | 90.00% | 50.00% | | 70.00% |
| anli_r3 | 50.00% | 80.00% | | 65.00% |
| snli | 80.00% | 70.00% | | 75.00% |
| sem_eval_2010_task_8 | 70.00% | 50.00% | | 60.00% |
| smollm_corpus | 60.00% | 20.00% | | 40.00% |
| medical_abstracts | 40.00% | 50.00% | | 45.00% |
| lex_glue_case_hold | 40.00% | 60.00% | | 50.00% |
| lex_glue_ledgar | 50.00% | 80.00% | | 65.00% |
| dbpedia_easy | 100.00% | 90.00% | | 95.00% |
| dbpedia_medium | 30.00% | 50.00% | | 40.00% |
| dbpedia_hard | 40.00% | 10.00% | | 25.00% |
| colbert_humor_detection | 80.00% | 50.00% | | 65.00% |
| tweet_eval_irony | 30.00% | 50.00% | | 40.00% |
| tweet_eval_hate | 70.00% | 70.00% | | 70.00% |
| tweet_eval_offensive | 80.00% | 60.00% | | 70.00% |
| customer_support_tickets_en | 50.00% | 60.00% | | 55.00% |
| customer_support_tickets_gorkem | 0.00% | 0.00% | | 0.00% |
| yahoo_answers_quality | 20.00% | 20.00% | | 20.00% |
| big_patent_innovation | 50.00% | 70.00% | | 60.00% |
| writingprompts_quality | 10.00% | 40.00% | | 25.00% |
| enron_email_type | 40.00% | 50.00% | | 45.00% |
| enron_email_quality | 10.00% | 0.00% | | 5.00% |
| enron_reply_quality | 50.00% | 60.00% | | 55.00% |
| argument_quality_ranking | | | 30.00% | 30.00% |
| rod101_essay_scoring | | | 10.00% | 10.00% |
| or_bench_toxic | | | 40.00% | 40.00% |
| judge_bench | | | 40.00% | 40.00% |
| musr_team_allocation | | | 70.00% | 70.00% |
| musr_object_placements | | | 40.00% | 40.00% |
| musr_murder_mysteries | | | 50.00% | 50.00% |
| halueval_summarization | | | 70.00% | 70.00% |
| code_judge_bench | | | 60.00% | 60.00% |
| mmlu_pro | | | 70.00% | 70.00% |
| gpqa_diamond | | | 30.00% | 30.00% |
| arc_challenge | | | 100.00% | 100.00% |
| mt_bench_human_judgments | | | 70.00% | 70.00% |
| all | 56.82% | 56.82% | 52.31% | 56.24% |
提供机构:
tytodd



