ibm-aimc/cosmopedia-4k
收藏 · Hugging Face · 2024-05-29 更新 · 2026-04-05 收录
下载链接:
https://hf-mirror.com/datasets/ibm-aimc/cosmopedia-4k
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
# Feature schema and split sizes for the `auto_math_text` config.
- config_name: auto_math_text
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 7563022920.0
    num_examples: 307590
  - name: test
    num_bytes: 76394916.0
    num_examples: 3107
  download_size: 2223802526
  dataset_size: 7639417836.0
# Feature schema and split sizes for the `auto_math_text_and_metamathQA` config.
- config_name: auto_math_text_and_metamathQA
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 8169779044.07883
    num_examples: 334424
  - name: test
    num_bytes: 77146266.06434317
    num_examples: 3379
  download_size: 2365400133
  dataset_size: 8246925310.143173
# Feature schema and split sizes for the
# `auto_math_text_metamathQA_mathinstruct` config.
- config_name: auto_math_text_metamathQA_mathinstruct
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 8502358284.0
    num_examples: 345793
  - name: test
    num_bytes: 85910472.0
    num_examples: 3494
  download_size: 2447268798
  dataset_size: 8588268756.0
# Feature schema and split sizes for the
# `auto_math_text_metamathQA_mathinstruct_math` config.
# NOTE(review): every size and example count below is identical to the
# `auto_math_text_metamathQA_mathinstruct` config — confirm this is intended.
- config_name: auto_math_text_metamathQA_mathinstruct_math
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 8502358284
    num_examples: 345793
  - name: test
    num_bytes: 85910472
    num_examples: 3494
  download_size: 2447268798
  dataset_size: 8588268756
# Feature schema and split sizes for the `khanacademy` config.
- config_name: khanacademy
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 124907040.0
    num_examples: 5080
  - name: test
    num_bytes: 1278576.0
    num_examples: 52
  download_size: 36745251
  dataset_size: 126185616.0
# Feature schema and split sizes for the `metamathQA` config.
# NOTE(review): the two mask features are int64 here, while the other
# configs visible in this file declare them as int8 — confirm this is intended.
- config_name: metamathQA
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int64
  - name: special_tokens_mask
    sequence: int64
  splits:
  - name: train
    num_bytes: 2007825592
    num_examples: 24506
  - name: test
    num_bytes: 20073340
    num_examples: 245
  download_size: 144507072
  dataset_size: 2027898932
# Feature schema and split sizes for the `openstax` config.
- config_name: openstax
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 631198548.0
    num_examples: 25671
  - name: test
    num_bytes: 6392880.0
    num_examples: 260
  download_size: 198464341
  dataset_size: 637591428.0
# Feature schema and split sizes for the `stanford` config.
- config_name: stanford
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 6154868160.0
    num_examples: 250320
  - name: test
    num_bytes: 62183052.0
    num_examples: 2529
  download_size: 1951182516
  dataset_size: 6217051212.0
# Feature schema and split sizes for the `stories` config.
- config_name: stories
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 16913175444.0
    num_examples: 687863
  - name: test
    num_bytes: 170862012.0
    num_examples: 6949
  download_size: 5305306879
  dataset_size: 17084037456.0
# Feature schema and split sizes for the `web_samples_v1` config.
- config_name: web_samples_v1
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 61314972660.0
    num_examples: 2493695
  - name: test
    num_bytes: 619347132.0
    num_examples: 25189
  download_size: 19279180115
  dataset_size: 61934319792.0
# Feature schema and split sizes for the `web_samples_v2` config.
- config_name: web_samples_v2
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 49068009504.0
    num_examples: 1995608
  - name: test
    num_bytes: 495644904.0
    num_examples: 20158
  download_size: 15406933375
  dataset_size: 49563654408.0
# Feature schema and split sizes for the `wikihow` config.
- config_name: wikihow
  features:
  - name: input_ids
    sequence: int32
  - name: attention_mask
    sequence: int8
  - name: special_tokens_mask
    sequence: int8
  splits:
  - name: train
    num_bytes: 1071274572.0
    num_examples: 43569
  - name: test
    num_bytes: 10843308.0
    num_examples: 441
  download_size: 324336853
  dataset_size: 1082117880.0
configs:
# File globs backing the train and test splits of the `auto_math_text` config.
- config_name: auto_math_text
  data_files:
  - split: train
    path: auto_math_text/train-*
  - split: test
    path: auto_math_text/test-*
# File globs backing the train and test splits of the
# `auto_math_text_and_metamathQA` config.
- config_name: auto_math_text_and_metamathQA
  data_files:
  - split: train
    path: auto_math_text_and_metamathQA/train-*
  - split: test
    path: auto_math_text_and_metamathQA/test-*
# File globs backing the train and test splits of the
# `auto_math_text_metamathQA_mathinstruct` config.
- config_name: auto_math_text_metamathQA_mathinstruct
  data_files:
  - split: train
    path: auto_math_text_metamathQA_mathinstruct/train-*
  - split: test
    path: auto_math_text_metamathQA_mathinstruct/test-*
# File globs backing the train and test splits of the
# `auto_math_text_metamathQA_mathinstruct_math` config.
- config_name: auto_math_text_metamathQA_mathinstruct_math
  data_files:
  - split: train
    path: auto_math_text_metamathQA_mathinstruct_math/train-*
  - split: test
    path: auto_math_text_metamathQA_mathinstruct_math/test-*
# File globs backing the train and test splits of the `khanacademy` config.
- config_name: khanacademy
  data_files:
  - split: train
    path: khanacademy/train-*
  - split: test
    path: khanacademy/test-*
# File globs backing the train and test splits of the `metamathQA` config.
- config_name: metamathQA
  data_files:
  - split: train
    path: metamathQA/train-*
  - split: test
    path: metamathQA/test-*
# File globs backing the train and test splits of the `openstax` config.
- config_name: openstax
  data_files:
  - split: train
    path: openstax/train-*
  - split: test
    path: openstax/test-*
# File globs backing the train and test splits of the `stanford` config.
- config_name: stanford
  data_files:
  - split: train
    path: stanford/train-*
  - split: test
    path: stanford/test-*
# File globs backing the train and test splits of the `stories` config.
- config_name: stories
  data_files:
  - split: train
    path: stories/train-*
  - split: test
    path: stories/test-*
# File globs backing the train and test splits of the `web_samples_v1` config.
- config_name: web_samples_v1
  data_files:
  - split: train
    path: web_samples_v1/train-*
  - split: test
    path: web_samples_v1/test-*
# File globs backing the train and test splits of the `web_samples_v2` config.
- config_name: web_samples_v2
  data_files:
  - split: train
    path: web_samples_v2/train-*
  - split: test
    path: web_samples_v2/test-*
# File globs backing the train and test splits of the `wikihow` config.
- config_name: wikihow
  data_files:
  - split: train
    path: wikihow/train-*
  - split: test
    path: wikihow/test-*
---
提供机构:
ibm-aimc



