SAIFIINDUSTRIES/training_data_2
收藏 | Hugging Face | 2026-04-14 更新 | 2026-04-26 收录
下载链接:
https://hf-mirror.com/datasets/SAIFIINDUSTRIES/training_data_2
下载链接
链接失效反馈 | 官方服务:
资源简介:
---
# Dataset-card front matter: `dataset_info` lists, per config, the column
# schema and split statistics; `configs` maps each config to its data files.
#
# NOTE(review): the nesting below was reconstructed from a copy in which all
# indentation had been lost. Children of `struct:` / `list:` and the position
# of `__index_level_0__` are inferred from key order -- confirm against the
# upstream README before relying on them.
# NOTE(review): `num_bytes` and `dataset_size` are 0 for every config while
# `download_size` is non-zero; presumably these were never populated -- verify.
dataset_info:
- config_name: cosmopedia_5gb
  features:
  - name: text_token_length
    dtype: int64
  - name: prompt
    dtype: string
  - name: text
    dtype: string
  - name: seed_data
    dtype: string
  - name: format
    dtype: string
  - name: audience
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 529410
  download_size: 1632209671
  dataset_size: 0
- config_name: dense_clean_pajama
  features:
  - name: text
    dtype: string
  - name: meta
    struct:
    - name: redpajama_set_name
      dtype: string
  # Assumed to be a top-level column rather than a member of `meta` -- confirm.
  - name: __index_level_0__
    dtype: int64
  splits:
  - name: train
    num_bytes: 0
    num_examples: 1129030
  download_size: 2861365836
  dataset_size: 0
- config_name: dense_knowledge_edu
  features:
  - name: text
    dtype: string
  - name: id
    dtype: string
  - name: dump
    dtype: string
  - name: url
    dtype: string
  - name: file_path
    dtype: string
  - name: language
    dtype: string
  - name: language_score
    dtype: float64
  - name: token_count
    dtype: int64
  - name: score
    dtype: float64
  - name: int_score
    dtype: int64
  splits:
  - name: train
    num_bytes: 0
    num_examples: 1666663
  download_size: 4809242032
  dataset_size: 0
- config_name: dense_reasoning_hermes
  features:
  - name: custom_instruction
    dtype: bool
  - name: topic
    dtype: string
  - name: model_name
    dtype: string
  - name: model
    dtype: string
  - name: skip_prompt_formatting
    dtype: bool
  - name: category
    dtype: string
  # Assumes each conversation turn carries `from`, `value` and `weight`
  # (`weight` placed inside the list item) -- confirm.
  - name: conversations
    list:
    - name: from
      dtype: string
    - name: value
      dtype: string
    - name: weight
      dtype: float64
  - name: views
    dtype: int64
  - name: language
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: idx
    dtype: string
  - name: hash
    list: int64
  - name: avatarUrl
    dtype: string
  - name: system_prompt
    dtype: string
  - name: source
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 250388
  download_size: 406401718
  dataset_size: 0
- config_name: multi_hindi
  features:
  - name: text
    dtype: string
  - name: timestamp
    dtype: string
  - name: url
    dtype: string
  - name: source
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 180000
  download_size: 513070836
  dataset_size: 0
- config_name: numinamath_cot
  features:
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 200000
  download_size: 136997494
  dataset_size: 0
- config_name: openthoughts_cot
  features:
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 37986
  download_size: 362775053
  dataset_size: 0
- config_name: smollm_cosmo_v2
  features:
  - name: prompt
    dtype: string
  - name: text
    dtype: string
  - name: token_length
    dtype: int64
  - name: audience
    dtype: string
  - name: format
    dtype: string
  - name: seed_data
    dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 1166669
  download_size: 3582341033
  dataset_size: 0
- config_name: ultrachat_logic
  features:
  - name: prompt
    dtype: string
  - name: prompt_id
    dtype: string
  - name: messages
    list:
    - name: content
      dtype: string
    - name: role
      dtype: string
  splits:
  - name: train
    num_bytes: 0
    num_examples: 69288
  download_size: 441963673
  dataset_size: 0
# Each config reads its single `train` split from the files matching
# `<config_name>/train-*`.
configs:
- config_name: cosmopedia_5gb
  data_files:
  - split: train
    path: cosmopedia_5gb/train-*
- config_name: dense_clean_pajama
  data_files:
  - split: train
    path: dense_clean_pajama/train-*
- config_name: dense_knowledge_edu
  data_files:
  - split: train
    path: dense_knowledge_edu/train-*
- config_name: dense_reasoning_hermes
  data_files:
  - split: train
    path: dense_reasoning_hermes/train-*
- config_name: multi_hindi
  data_files:
  - split: train
    path: multi_hindi/train-*
- config_name: numinamath_cot
  data_files:
  - split: train
    path: numinamath_cot/train-*
- config_name: openthoughts_cot
  data_files:
  - split: train
    path: openthoughts_cot/train-*
- config_name: smollm_cosmo_v2
  data_files:
  - split: train
    path: smollm_cosmo_v2/train-*
- config_name: ultrachat_logic
  data_files:
  - split: train
    path: ultrachat_logic/train-*
---
提供机构:
SAIFIINDUSTRIES



