tytodd/qwen3.5-4b-smoke-test
收藏Hugging Face2026-03-23 更新2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/tytodd/qwen3.5-4b-smoke-test
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: aes2_essay_scoring
features:
- name: input
struct:
- name: full_text
dtype: string
- name: prediction
struct:
- name: score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 699339
num_examples: 20
- name: val
num_bytes: 354604
num_examples: 10
download_size: 1084957
dataset_size: 1053943
- config_name: arc_challenge
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 98718
num_examples: 5
download_size: 120362
dataset_size: 98718
- config_name: argument_quality_ranking
features:
- name: input
struct:
- name: argument
dtype: string
- name: topic
dtype: string
- name: prediction
struct:
- name: quality_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 100163
num_examples: 5
download_size: 112373
dataset_size: 100163
- config_name: bbeh
features:
- name: input
struct:
- name: question
dtype: string
- name: task
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 316554
num_examples: 5
download_size: 319728
dataset_size: 316554
- config_name: bbh_causal_judgement
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 87696
num_examples: 5
download_size: 111499
dataset_size: 87696
- config_name: bbh_disambiguation_qa
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 91084
num_examples: 5
download_size: 106583
dataset_size: 91084
- config_name: bbh_geometric_shapes
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 92590
num_examples: 5
download_size: 109393
dataset_size: 92590
- config_name: bbh_movie_recommendation
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 103382
num_examples: 5
download_size: 120237
dataset_size: 103382
- config_name: bbh_reasoning_about_colored_objects
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 53388
num_examples: 5
download_size: 66814
dataset_size: 53388
- config_name: bbh_ruin_names
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 85838
num_examples: 5
download_size: 103803
dataset_size: 85838
- config_name: bbh_salient_translation_error_detection
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 119247
num_examples: 5
download_size: 143564
dataset_size: 119247
- config_name: bbh_snarks
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 85593
num_examples: 5
download_size: 101084
dataset_size: 85593
- config_name: bbh_sports_understanding
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 81203
num_examples: 5
download_size: 96571
dataset_size: 81203
- config_name: bbh_tracking_shuffled_objects_five_objects
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 100061
num_examples: 5
download_size: 116370
dataset_size: 100061
- config_name: bbh_web_of_lies
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: answer
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 79387
num_examples: 5
download_size: 89673
dataset_size: 79387
- config_name: civil_comments
features:
- name: input
struct:
- name: comment
dtype: string
- name: prediction
struct:
- name: toxicity_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 178730
num_examples: 20
- name: val
num_bytes: 117530
num_examples: 10
download_size: 316634
dataset_size: 296260
- config_name: code_judge_bench
features:
- name: input
struct:
- name: code_A
dtype: string
- name: code_B
dtype: string
- name: problem
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 261694
num_examples: 5
download_size: 312409
dataset_size: 261694
- config_name: colbert_humor_detection
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: humor_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 270724
num_examples: 20
- name: val
num_bytes: 135346
num_examples: 10
download_size: 424878
dataset_size: 406070
- config_name: customer_support_tickets_en
features:
- name: input
struct:
- name: body
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: queue
dtype: string
- name: type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 460425
num_examples: 20
- name: val
num_bytes: 237580
num_examples: 10
download_size: 736315
dataset_size: 698005
- config_name: customer_support_tickets_gorkem
features:
- name: input
struct:
- name: ticket_text
dtype: string
- name: prediction
struct:
- name: ticket_subject
dtype: string
- name: ticket_type
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 251194
num_examples: 20
- name: val
num_bytes: 104441
num_examples: 10
download_size: 390077
dataset_size: 355635
- config_name: go_emotions
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: labels
list: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 356744
num_examples: 20
- name: val
num_bytes: 207716
num_examples: 10
download_size: 585978
dataset_size: 564460
- config_name: gpqa_diamond
features:
- name: input
struct:
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 149684
num_examples: 5
download_size: 173381
dataset_size: 149684
- config_name: halueval_summarization
features:
- name: input
struct:
- name: document
dtype: string
- name: summary
dtype: string
- name: prediction
struct:
- name: hallucination
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 153535
num_examples: 5
download_size: 174945
dataset_size: 153535
- config_name: hh_rlhf
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 353627
num_examples: 20
- name: val
num_bytes: 174115
num_examples: 10
download_size: 565487
dataset_size: 527742
- config_name: judge_bench
features:
- name: input
struct:
- name: question
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 238626
num_examples: 5
download_size: 287046
dataset_size: 238626
- config_name: lex_glue_case_hold
features:
- name: input
struct:
- name: context
dtype: string
- name: option_a
dtype: string
- name: option_b
dtype: string
- name: option_c
dtype: string
- name: option_d
dtype: string
- name: option_e
dtype: string
- name: prediction
struct:
- name: selected_option
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 626674
num_examples: 20
- name: val
num_bytes: 310730
num_examples: 10
download_size: 981668
dataset_size: 937404
- config_name: lex_glue_scotus
features:
- name: input
struct:
- name: opinion_text
dtype: string
- name: prediction
struct:
- name: issue_id
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 1692204
num_examples: 20
- name: val
num_bytes: 1324105
num_examples: 10
download_size: 3030377
dataset_size: 3016309
- config_name: medical_abstracts
features:
- name: input
struct:
- name: medical_abstract
dtype: string
- name: prediction
struct:
- name: condition_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 304294
num_examples: 20
- name: val
num_bytes: 208689
num_examples: 10
download_size: 556264
dataset_size: 512983
- config_name: mfrc
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: annotation
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 404673
num_examples: 20
- name: val
num_bytes: 213994
num_examples: 10
download_size: 639407
dataset_size: 618667
- config_name: mmlu
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 83362
num_examples: 5
download_size: 107724
dataset_size: 83362
- config_name: mmlu_pro
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 135363
num_examples: 5
download_size: 151147
dataset_size: 135363
- config_name: musr_murder_mysteries
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 187833
num_examples: 5
download_size: 199396
dataset_size: 187833
- config_name: musr_object_placements
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 162252
num_examples: 5
download_size: 175079
dataset_size: 162252
- config_name: musr_team_allocation
features:
- name: input
struct:
- name: choices
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: choice
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 148087
num_examples: 5
download_size: 182083
dataset_size: 148087
- config_name: or_bench_80k
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 471701
num_examples: 20
- name: val
num_bytes: 277515
num_examples: 10
download_size: 767369
dataset_size: 749216
- config_name: or_bench_hard_1k
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 438271
num_examples: 20
- name: val
num_bytes: 170584
num_examples: 10
download_size: 633701
dataset_size: 608855
- config_name: or_bench_toxic
features:
- name: input
struct:
- name: prompt
dtype: string
- name: prediction
struct:
- name: or_bench_category
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 108971
num_examples: 5
download_size: 126115
dataset_size: 108971
- config_name: projudgebench
features:
- name: input
struct:
- name: correct_answer
dtype: string
- name: question
dtype: string
- name: step_to_evaluate
dtype: string
- name: steps
list: string
- name: prediction
struct:
- name: correct
dtype: bool
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 674735
num_examples: 20
- name: val
num_bytes: 266862
num_examples: 10
download_size: 987341
dataset_size: 941597
- config_name: reward_bench_2
features:
- name: input
struct:
- name: prompt
dtype: string
- name: response_A
dtype: string
- name: response_B
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 554311
num_examples: 20
- name: val
num_bytes: 266437
num_examples: 10
download_size: 889958
dataset_size: 820748
- config_name: rod101_essay_scoring
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: score
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: ood
num_bytes: 116781
num_examples: 5
download_size: 148633
dataset_size: 116781
- config_name: seekbench
features:
- name: input
struct:
- name: current_trace
dtype: string
- name: previous_traces
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: groundness
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 261853
num_examples: 20
- name: val
num_bytes: 182333
num_examples: 10
download_size: 481073
dataset_size: 444186
- config_name: seekbench_evidence
features:
- name: input
struct:
- name: current_trace
dtype: string
- name: previous_traces
dtype: string
- name: question
dtype: string
- name: prediction
struct:
- name: clear
dtype: string
- name: sufficient
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 396579
num_examples: 20
- name: val
num_bytes: 298885
num_examples: 10
download_size: 739067
dataset_size: 695464
- config_name: seekbench_full_trace
features:
- name: input
struct:
- name: final_answer
dtype: string
- name: question
dtype: string
- name: trace
dtype: string
- name: prediction
struct:
- name: correctness
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 479169
num_examples: 20
- name: val
num_bytes: 345751
num_examples: 10
download_size: 841025
dataset_size: 824920
- config_name: sem_eval_2010_task_8
features:
- name: input
struct:
- name: sentence
dtype: string
- name: prediction
struct:
- name: relation_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 419023
num_examples: 20
- name: val
num_bytes: 181226
num_examples: 10
download_size: 629294
dataset_size: 600249
- config_name: smollm_corpus
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: audience
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 645855
num_examples: 20
- name: val
num_bytes: 295932
num_examples: 10
download_size: 987375
dataset_size: 941787
- config_name: snli
features:
- name: input
struct:
- name: hypothesis
dtype: string
- name: premise
dtype: string
- name: prediction
struct:
- name: label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 412265
num_examples: 20
- name: val
num_bytes: 221303
num_examples: 10
download_size: 653175
dataset_size: 633568
- config_name: support_tickets_alpha
features:
- name: input
struct:
- name: description
dtype: string
- name: subject
dtype: string
- name: prediction
struct:
- name: key_phrase
dtype: string
- name: support_class
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 305422
num_examples: 20
- name: val
num_bytes: 96397
num_examples: 10
download_size: 429034
dataset_size: 401819
- config_name: toxigen_data
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: toxicity_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 216991
num_examples: 20
- name: val
num_bytes: 87863
num_examples: 10
download_size: 325212
dataset_size: 304854
- config_name: tweet_eval_emotion
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: emotion_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 166840
num_examples: 20
- name: val
num_bytes: 108419
num_examples: 10
download_size: 295387
dataset_size: 275259
- config_name: tweet_eval_hate
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: hate_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 187776
num_examples: 20
- name: val
num_bytes: 105180
num_examples: 10
download_size: 307441
dataset_size: 292956
- config_name: tweet_eval_irony
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: irony_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 204742
num_examples: 20
- name: val
num_bytes: 133996
num_examples: 10
download_size: 361374
dataset_size: 338738
- config_name: tweet_eval_offensive
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: offensive_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 190142
num_examples: 20
- name: val
num_bytes: 81634
num_examples: 10
download_size: 292590
dataset_size: 271776
- config_name: tweet_eval_sentiment
features:
- name: input
struct:
- name: tweet
dtype: string
- name: prediction
struct:
- name: sentiment_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 249975
num_examples: 20
- name: val
num_bytes: 78152
num_examples: 10
download_size: 347367
dataset_size: 328127
- config_name: tweet_eval_stance_abortion
features:
- name: input
struct:
- name: topic
dtype: string
- name: tweet
dtype: string
- name: prediction
struct:
- name: stance_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 254762
num_examples: 20
- name: val
num_bytes: 103078
num_examples: 10
download_size: 382455
dataset_size: 357840
- config_name: tweet_eval_stance_atheism
features:
- name: input
struct:
- name: topic
dtype: string
- name: tweet
dtype: string
- name: prediction
struct:
- name: stance_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 205875
num_examples: 20
- name: val
num_bytes: 109055
num_examples: 10
download_size: 338343
dataset_size: 314930
- config_name: tweet_eval_stance_climate
features:
- name: input
struct:
- name: topic
dtype: string
- name: tweet
dtype: string
- name: prediction
struct:
- name: stance_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 229955
num_examples: 20
- name: val
num_bytes: 130852
num_examples: 10
download_size: 379768
dataset_size: 360807
- config_name: tweet_eval_stance_feminist
features:
- name: input
struct:
- name: topic
dtype: string
- name: tweet
dtype: string
- name: prediction
struct:
- name: stance_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 270732
num_examples: 20
- name: val
num_bytes: 158072
num_examples: 10
download_size: 450611
dataset_size: 428804
- config_name: tweet_eval_stance_hillary
features:
- name: input
struct:
- name: topic
dtype: string
- name: tweet
dtype: string
- name: prediction
struct:
- name: stance_label
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 249746
num_examples: 20
- name: val
num_bytes: 101358
num_examples: 10
download_size: 371103
dataset_size: 351104
- config_name: ultrafeedback
features:
- name: input
struct:
- name: prompt
dtype: string
- name: response
dtype: string
- name: prediction
struct:
- name: instruction_following
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 409873
num_examples: 20
- name: val
num_bytes: 170167
num_examples: 10
download_size: 630154
dataset_size: 580040
- config_name: yelp
features:
- name: input
struct:
- name: text
dtype: string
- name: prediction
struct:
- name: rating
dtype: string
- name: reasoning
dtype: string
- name: messages
struct:
- name: messages
list:
- name: content
dtype: string
- name: role
dtype: string
- name: outputs
struct:
- name: reasoning_content
dtype: string
- name: text
dtype: string
- name: correct
dtype: bool
splits:
- name: train
num_bytes: 303906
num_examples: 20
- name: val
num_bytes: 184947
num_examples: 10
download_size: 524495
dataset_size: 488853
configs:
- config_name: aes2_essay_scoring
data_files:
- split: train
path: aes2_essay_scoring/train-*
- split: val
path: aes2_essay_scoring/val-*
- config_name: arc_challenge
data_files:
- split: ood
path: arc_challenge/ood-*
- config_name: argument_quality_ranking
data_files:
- split: ood
path: argument_quality_ranking/ood-*
- config_name: bbeh
data_files:
- split: ood
path: bbeh/ood-*
- config_name: bbh_causal_judgement
data_files:
- split: ood
path: bbh_causal_judgement/ood-*
- config_name: bbh_disambiguation_qa
data_files:
- split: ood
path: bbh_disambiguation_qa/ood-*
- config_name: bbh_geometric_shapes
data_files:
- split: ood
path: bbh_geometric_shapes/ood-*
- config_name: bbh_movie_recommendation
data_files:
- split: ood
path: bbh_movie_recommendation/ood-*
- config_name: bbh_reasoning_about_colored_objects
data_files:
- split: ood
path: bbh_reasoning_about_colored_objects/ood-*
- config_name: bbh_ruin_names
data_files:
- split: ood
path: bbh_ruin_names/ood-*
- config_name: bbh_salient_translation_error_detection
data_files:
- split: ood
path: bbh_salient_translation_error_detection/ood-*
- config_name: bbh_snarks
data_files:
- split: ood
path: bbh_snarks/ood-*
- config_name: bbh_sports_understanding
data_files:
- split: ood
path: bbh_sports_understanding/ood-*
- config_name: bbh_tracking_shuffled_objects_five_objects
data_files:
- split: ood
path: bbh_tracking_shuffled_objects_five_objects/ood-*
- config_name: bbh_web_of_lies
data_files:
- split: ood
path: bbh_web_of_lies/ood-*
- config_name: civil_comments
data_files:
- split: train
path: civil_comments/train-*
- split: val
path: civil_comments/val-*
- config_name: code_judge_bench
data_files:
- split: ood
path: code_judge_bench/ood-*
- config_name: colbert_humor_detection
data_files:
- split: train
path: colbert_humor_detection/train-*
- split: val
path: colbert_humor_detection/val-*
- config_name: customer_support_tickets_en
data_files:
- split: train
path: customer_support_tickets_en/train-*
- split: val
path: customer_support_tickets_en/val-*
- config_name: customer_support_tickets_gorkem
data_files:
- split: train
path: customer_support_tickets_gorkem/train-*
- split: val
path: customer_support_tickets_gorkem/val-*
- config_name: go_emotions
data_files:
- split: train
path: go_emotions/train-*
- split: val
path: go_emotions/val-*
- config_name: gpqa_diamond
data_files:
- split: ood
path: gpqa_diamond/ood-*
- config_name: halueval_summarization
data_files:
- split: ood
path: halueval_summarization/ood-*
- config_name: hh_rlhf
data_files:
- split: train
path: hh_rlhf/train-*
- split: val
path: hh_rlhf/val-*
- config_name: judge_bench
data_files:
- split: ood
path: judge_bench/ood-*
- config_name: lex_glue_case_hold
data_files:
- split: train
path: lex_glue_case_hold/train-*
- split: val
path: lex_glue_case_hold/val-*
- config_name: lex_glue_scotus
data_files:
- split: train
path: lex_glue_scotus/train-*
- split: val
path: lex_glue_scotus/val-*
- config_name: medical_abstracts
data_files:
- split: train
path: medical_abstracts/train-*
- split: val
path: medical_abstracts/val-*
- config_name: mfrc
data_files:
- split: train
path: mfrc/train-*
- split: val
path: mfrc/val-*
- config_name: mmlu
data_files:
- split: ood
path: mmlu/ood-*
- config_name: mmlu_pro
data_files:
- split: ood
path: mmlu_pro/ood-*
- config_name: musr_murder_mysteries
data_files:
- split: ood
path: musr_murder_mysteries/ood-*
- config_name: musr_object_placements
data_files:
- split: ood
path: musr_object_placements/ood-*
- config_name: musr_team_allocation
data_files:
- split: ood
path: musr_team_allocation/ood-*
- config_name: or_bench_80k
data_files:
- split: train
path: or_bench_80k/train-*
- split: val
path: or_bench_80k/val-*
- config_name: or_bench_hard_1k
data_files:
- split: train
path: or_bench_hard_1k/train-*
- split: val
path: or_bench_hard_1k/val-*
- config_name: or_bench_toxic
data_files:
- split: ood
path: or_bench_toxic/ood-*
- config_name: projudgebench
data_files:
- split: train
path: projudgebench/train-*
- split: val
path: projudgebench/val-*
- config_name: reward_bench_2
data_files:
- split: train
path: reward_bench_2/train-*
- split: val
path: reward_bench_2/val-*
- config_name: rod101_essay_scoring
data_files:
- split: ood
path: rod101_essay_scoring/ood-*
- config_name: seekbench
data_files:
- split: train
path: seekbench/train-*
- split: val
path: seekbench/val-*
- config_name: seekbench_evidence
data_files:
- split: train
path: seekbench_evidence/train-*
- split: val
path: seekbench_evidence/val-*
- config_name: seekbench_full_trace
data_files:
- split: train
path: seekbench_full_trace/train-*
- split: val
path: seekbench_full_trace/val-*
- config_name: sem_eval_2010_task_8
data_files:
- split: train
path: sem_eval_2010_task_8/train-*
- split: val
path: sem_eval_2010_task_8/val-*
- config_name: smollm_corpus
data_files:
- split: train
path: smollm_corpus/train-*
- split: val
path: smollm_corpus/val-*
- config_name: snli
data_files:
- split: train
path: snli/train-*
- split: val
path: snli/val-*
- config_name: support_tickets_alpha
data_files:
- split: train
path: support_tickets_alpha/train-*
- split: val
path: support_tickets_alpha/val-*
- config_name: toxigen_data
data_files:
- split: train
path: toxigen_data/train-*
- split: val
path: toxigen_data/val-*
- config_name: tweet_eval_emotion
data_files:
- split: train
path: tweet_eval_emotion/train-*
- split: val
path: tweet_eval_emotion/val-*
- config_name: tweet_eval_hate
data_files:
- split: train
path: tweet_eval_hate/train-*
- split: val
path: tweet_eval_hate/val-*
- config_name: tweet_eval_irony
data_files:
- split: train
path: tweet_eval_irony/train-*
- split: val
path: tweet_eval_irony/val-*
- config_name: tweet_eval_offensive
data_files:
- split: train
path: tweet_eval_offensive/train-*
- split: val
path: tweet_eval_offensive/val-*
- config_name: tweet_eval_sentiment
data_files:
- split: train
path: tweet_eval_sentiment/train-*
- split: val
path: tweet_eval_sentiment/val-*
- config_name: tweet_eval_stance_abortion
data_files:
- split: train
path: tweet_eval_stance_abortion/train-*
- split: val
path: tweet_eval_stance_abortion/val-*
- config_name: tweet_eval_stance_atheism
data_files:
- split: train
path: tweet_eval_stance_atheism/train-*
- split: val
path: tweet_eval_stance_atheism/val-*
- config_name: tweet_eval_stance_climate
data_files:
- split: train
path: tweet_eval_stance_climate/train-*
- split: val
path: tweet_eval_stance_climate/val-*
- config_name: tweet_eval_stance_feminist
data_files:
- split: train
path: tweet_eval_stance_feminist/train-*
- split: val
path: tweet_eval_stance_feminist/val-*
- config_name: tweet_eval_stance_hillary
data_files:
- split: train
path: tweet_eval_stance_hillary/train-*
- split: val
path: tweet_eval_stance_hillary/val-*
- config_name: ultrafeedback
data_files:
- split: train
path: ultrafeedback/train-*
- split: val
path: ultrafeedback/val-*
- config_name: yelp
data_files:
- split: train
path: yelp/train-*
- split: val
path: yelp/val-*
---
# qwen3.5-4b-smoke-test
- Repo: `tytodd/qwen3.5-4b-smoke-test`
- Local path: `datasets/qwen3.5-4b-smoke-test`
- Config: `configs/datasets/smoke-test/smoke-test.yaml`

| benchmark | train | val | ood | all |
| --- | --- | --- | --- | --- |
| customer_support_tickets_gorkem | 5.00% | 0.00% | | 3.33% |
| mfrc | 0.00% | 0.00% | | 0.00% |
| go_emotions | 35.00% | 20.00% | | 30.00% |
| customer_support_tickets_en | 45.00% | 30.00% | | 40.00% |
| aes2_essay_scoring | 65.00% | 30.00% | | 53.33% |
| ultrafeedback | 40.00% | 20.00% | | 33.33% |
| smollm_corpus | 60.00% | 60.00% | | 60.00% |
| or_bench_80k | 20.00% | 30.00% | | 23.33% |
| lex_glue_scotus | 75.00% | 90.00% | | 80.00% |
| medical_abstracts | 65.00% | 50.00% | | 60.00% |
| seekbench_evidence | 35.00% | 50.00% | | 40.00% |
| yelp | 70.00% | 70.00% | | 70.00% |
| tweet_eval_sentiment | 50.00% | 70.00% | | 56.67% |
| hh_rlhf | 65.00% | 50.00% | | 60.00% |
| tweet_eval_stance_hillary | 50.00% | 50.00% | | 50.00% |
| tweet_eval_stance_atheism | 85.00% | 70.00% | | 80.00% |
| reward_bench_2 | 85.00% | 90.00% | | 86.67% |
| sem_eval_2010_task_8 | 45.00% | 70.00% | | 53.33% |
| seekbench | 55.00% | 60.00% | | 56.67% |
| tweet_eval_irony | 70.00% | 60.00% | | 66.67% |
| tweet_eval_offensive | 80.00% | 100.00% | | 86.67% |
| lex_glue_case_hold | 70.00% | 80.00% | | 73.33% |
| seekbench_full_trace | 75.00% | 70.00% | | 73.33% |
| tweet_eval_hate | 65.00% | 70.00% | | 66.67% |
| or_bench_hard_1k | 65.00% | 80.00% | | 70.00% |
| tweet_eval_stance_climate | 60.00% | 60.00% | | 60.00% |
| snli | 85.00% | 90.00% | | 86.67% |
| tweet_eval_stance_feminist | 65.00% | 80.00% | | 70.00% |
| tweet_eval_stance_abortion | 85.00% | 60.00% | | 76.67% |
| toxigen_data | 90.00% | 90.00% | | 90.00% |
| tweet_eval_emotion | 70.00% | 100.00% | | 80.00% |
| civil_comments | 80.00% | 100.00% | | 86.67% |
| projudgebench | 100.00% | 100.00% | | 100.00% |
| colbert_humor_detection | 90.00% | 80.00% | | 86.67% |
| support_tickets_alpha | 80.00% | 90.00% | | 83.33% |
| argument_quality_ranking | | | 60.00% | 60.00% |
| rod101_essay_scoring | | | 20.00% | 20.00% |
| or_bench_toxic | | | 80.00% | 80.00% |
| judge_bench | | | 60.00% | 60.00% |
| musr_team_allocation | | | 60.00% | 60.00% |
| musr_object_placements | | | 40.00% | 40.00% |
| bbh_disambiguation_qa | | | 100.00% | 100.00% |
| bbh_causal_judgement | | | 60.00% | 60.00% |
| musr_murder_mysteries | | | 80.00% | 80.00% |
| halueval_summarization | | | 60.00% | 60.00% |
| bbh_salient_translation_error_detection | | | 20.00% | 20.00% |
| bbh_movie_recommendation | | | 80.00% | 80.00% |
| bbh_sports_understanding | | | 60.00% | 60.00% |
| bbeh | | | 60.00% | 60.00% |
| bbh_geometric_shapes | | | 80.00% | 80.00% |
| code_judge_bench | | | 80.00% | 80.00% |
| mmlu_pro | | | 40.00% | 40.00% |
| bbh_ruin_names | | | 80.00% | 80.00% |
| bbh_snarks | | | 80.00% | 80.00% |
| gpqa_diamond | | | 80.00% | 80.00% |
| bbh_web_of_lies | | | 100.00% | 100.00% |
| mmlu | | | 100.00% | 100.00% |
| bbh_reasoning_about_colored_objects | | | 100.00% | 100.00% |
| bbh_tracking_shuffled_objects_five_objects | | | 100.00% | 100.00% |
| arc_challenge | | | 100.00% | 100.00% |
| all | 62.29% | 63.43% | 71.20% | 63.57% |

提供机构:
tytodd



