SAIFIINDUSTRIES/training_data_1
收藏 | Hugging Face | 2026-04-14 更新 | 2026-04-26 收录
下载链接:
https://hf-mirror.com/datasets/SAIFIINDUSTRIES/training_data_1
下载链接
链接失效反馈 | 官方服务:
资源简介:
---
# Dataset card metadata.
#   dataset_info: one entry per config, listing its feature schema, its splits
#                 and its size statistics.
#   configs:      one entry per config, mapping each split to a data-file glob.
#
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been flattened. Two placements are assumptions to confirm
# against the actual data files: `__index_level_0__` (dense_clean_pajama) is
# treated as a top-level feature rather than a member of `meta`, and `weight`
# (dense_reasoning_hermes) is treated as a member of `conversations`.
#
# NOTE(review): `num_bytes` and `dataset_size` are 0 for every config even
# though `num_examples` and `download_size` are non-zero; presumably they were
# never populated. Confirm and regenerate rather than trusting these values.
dataset_info:
- config_name: cosmopedia_5gb
  features:
  - name: text_token_length
    dtype: int64
  - name: prompt
    dtype: string
  - name: text
    dtype: string
  - name: seed_data
    dtype: string
  - name: format
    dtype: string
  - name: audience
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 529415
  download_size: 1630040202
  dataset_size: 0
- config_name: cosmopedia_basics
  features:
  - name: prompt
    dtype: string
  - name: text_token_length
    dtype: int64
  - name: text
    dtype: string
  - name: seed_data
    dtype: string
  - name: format
    dtype: string
  - name: audience
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 24123
  download_size: 47721180
  dataset_size: 0
- config_name: dense_clean_pajama
  features:
  - name: text
    dtype: string
  - name: meta
    struct:
    - name: redpajama_set_name
      dtype: string
  - name: __index_level_0__
    dtype: int64
  splits:
  - name: train
    num_bytes: 0
    num_examples: 1241940
  download_size: 3134343892
  dataset_size: 0
- config_name: dense_knowledge_edu
  features:
  - name: text
    dtype: string
  - name: id
    dtype: string
  - name: dump
    dtype: string
  - name: url
    dtype: string
  - name: file_path
    dtype: string
  - name: language
    dtype: string
  - name: language_score
    dtype: float64
  - name: token_count
    dtype: int64
  - name: score
    dtype: float64
  - name: int_score
    dtype: int64
  splits:
  - name: train
    num_bytes: 0
    num_examples: 1666674
  download_size: 4855486465
  dataset_size: 0
- config_name: dense_reasoning_hermes
  features:
  - name: custom_instruction
    dtype: bool
  - name: topic
    dtype: string
  - name: model_name
    dtype: string
  - name: model
    dtype: string
  - name: skip_prompt_formatting
    dtype: bool
  - name: category
    dtype: string
  - name: conversations
    list:
    - name: from
      dtype: string
    - name: value
      dtype: string
    - name: weight
      dtype: float64
  - name: views
    dtype: int64
  - name: language
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: idx
    dtype: string
  - name: hash
    list: int64
  - name: avatarUrl
    dtype: string
  - name: system_prompt
    dtype: string
  - name: source
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 500776
  download_size: 731897901
  dataset_size: 0
- config_name: evol_instruct_code
  features:
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 110000
  download_size: 128774102
  dataset_size: 0
- config_name: multi_hindi
  features:
  - name: text
    dtype: string
  - name: timestamp
    dtype: string
  - name: url
    dtype: string
  - name: source
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 240000
  download_size: 685840985
  dataset_size: 0
- config_name: numinamath_cot
  features:
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 200000
  download_size: 136940076
  dataset_size: 0
- config_name: openthoughts_cot
  features:
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 37986
  download_size: 443183937
  dataset_size: 0
- config_name: orca_math_cot
  features:
  - name: question
    dtype: string
  - name: answer
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 200035
  download_size: 82763504
  dataset_size: 0
- config_name: safety_rules
  features:
  - name: chosen
    dtype: string
  - name: rejected
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 50000
  download_size: 37160189
  dataset_size: 0
- config_name: smollm_cosmo_v2
  features:
  - name: prompt
    dtype: string
  - name: text
    dtype: string
  - name: token_length
    dtype: int64
  - name: audience
    dtype: string
  - name: format
    dtype: string
  - name: seed_data
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 1166672
  download_size: 3581751761
  dataset_size: 0
- config_name: ultrachat_logic
  features:
  - name: prompt
    dtype: string
  - name: prompt_id
    dtype: string
  - name: messages
    list:
    - name: content
      dtype: string
    - name: role
      dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 69289
  download_size: 441578154
  dataset_size: 0
# Each config reads its single `train` split from files matching the glob
# under the directory named after the config.
configs:
- config_name: cosmopedia_5gb
  data_files:
  - split: train
    path: cosmopedia_5gb/train-*
- config_name: cosmopedia_basics
  data_files:
  - split: train
    path: cosmopedia_basics/train-*
- config_name: dense_clean_pajama
  data_files:
  - split: train
    path: dense_clean_pajama/train-*
- config_name: dense_knowledge_edu
  data_files:
  - split: train
    path: dense_knowledge_edu/train-*
- config_name: dense_reasoning_hermes
  data_files:
  - split: train
    path: dense_reasoning_hermes/train-*
- config_name: evol_instruct_code
  data_files:
  - split: train
    path: evol_instruct_code/train-*
- config_name: multi_hindi
  data_files:
  - split: train
    path: multi_hindi/train-*
- config_name: numinamath_cot
  data_files:
  - split: train
    path: numinamath_cot/train-*
- config_name: openthoughts_cot
  data_files:
  - split: train
    path: openthoughts_cot/train-*
- config_name: orca_math_cot
  data_files:
  - split: train
    path: orca_math_cot/train-*
- config_name: safety_rules
  data_files:
  - split: train
    path: safety_rules/train-*
- config_name: smollm_cosmo_v2
  data_files:
  - split: train
    path: smollm_cosmo_v2/train-*
- config_name: ultrachat_logic
  data_files:
  - split: train
    path: ultrachat_logic/train-*
---
提供机构:
SAIFIINDUSTRIES



