kushalt/fineweb-edu-gpt2
收藏 | Hugging Face | 2026-02-09 更新 | 2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/kushalt/fineweb-edu-gpt2
下载链接
链接失效反馈官方服务:
资源简介:
---
# Per-configuration schema and size statistics.
# Every config declares the same ten features; only the split sizes differ.
# Within each config, dataset_size equals the sum of the split num_bytes.
dataset_info:
- config_name: sample-10BT_max_length_1025
  features:
  - name: token_count
    dtype: int64
  - name: input_ids
    list: uint16
  - name: pad_mask
    list: uint16
  - name: sequence_ids
    list: uint16
  - name: token_density
    dtype: float64
  - name: tokenizer
    dtype: string
  - name: max_length
    dtype: int64
  - name: total_tokens
    dtype: int64
  - name: train_portion
    dtype: float64
  - name: test_portion
    dtype: float64
  splits:
  - name: train
    num_bytes: 60043955332
    num_examples: 9656474
  - name: test
    num_bytes: 97168686
    num_examples: 15627
  download_size: 11295485515
  dataset_size: 60141124018
- config_name: sample-10BT_max_length_2049
  features:
  - name: token_count
    dtype: int64
  - name: input_ids
    list: uint16
  - name: pad_mask
    list: uint16
  - name: sequence_ids
    list: uint16
  - name: token_density
    dtype: float64
  - name: tokenizer
    dtype: string
  - name: max_length
    dtype: int64
  - name: total_tokens
    dtype: int64
  - name: train_portion
    dtype: float64
  - name: test_portion
    dtype: float64
  splits:
  - name: train
    num_bytes: 119411962838
    num_examples: 9659599
  - name: test
    num_bytes: 154549724
    num_examples: 12502
  download_size: 14113360857
  dataset_size: 119566512562
- config_name: sample-10BT_max_length_257
  features:
  - name: token_count
    dtype: int64
  - name: input_ids
    list: uint16
  - name: pad_mask
    list: uint16
  - name: sequence_ids
    list: uint16
  - name: token_density
    dtype: float64
  - name: tokenizer
    dtype: string
  - name: max_length
    dtype: int64
  - name: total_tokens
    dtype: int64
  - name: train_portion
    dtype: float64
  - name: test_portion
    dtype: float64
  splits:
  - name: train
    num_bytes: 15506586200
    num_examples: 9631420
  - name: test
    num_bytes: 65496410
    num_examples: 40681
  download_size: 4463727781
  dataset_size: 15572082610
- config_name: sample-10BT_max_length_4097
  features:
  - name: token_count
    dtype: int64
  - name: input_ids
    list: uint16
  - name: pad_mask
    list: uint16
  - name: sequence_ids
    list: uint16
  - name: token_density
    dtype: float64
  - name: tokenizer
    dtype: string
  - name: max_length
    dtype: int64
  - name: total_tokens
    dtype: int64
  - name: train_portion
    dtype: float64
  - name: test_portion
    dtype: float64
  splits:
  - name: train
    num_bytes: 238144512750
    num_examples: 9661035
  - name: test
    num_bytes: 272776900
    num_examples: 11066
  download_size: 16114790322
  dataset_size: 238417289650
- config_name: sample-10BT_max_length_513
  features:
  - name: token_count
    dtype: int64
  - name: input_ids
    list: uint16
  - name: pad_mask
    list: uint16
  - name: sequence_ids
    list: uint16
  - name: token_density
    dtype: float64
  - name: tokenizer
    dtype: string
  - name: max_length
    dtype: int64
  - name: total_tokens
    dtype: int64
  - name: train_portion
    dtype: float64
  - name: test_portion
    dtype: float64
  splits:
  # Byte counts are written as integers, matching the other configs.
  - name: train
    num_bytes: 30355083902
    num_examples: 9648787
  - name: test
    num_bytes: 73345844
    num_examples: 23314
  download_size: 9012988777
  dataset_size: 30428429746
# Maps each config name to the file globs that back its train and test splits.
# Each glob is rooted in a directory named after the config.
configs:
- config_name: sample-10BT_max_length_1025
  data_files:
  - split: train
    path: sample-10BT_max_length_1025/train-*
  - split: test
    path: sample-10BT_max_length_1025/test-*
- config_name: sample-10BT_max_length_2049
  data_files:
  - split: train
    path: sample-10BT_max_length_2049/train-*
  - split: test
    path: sample-10BT_max_length_2049/test-*
- config_name: sample-10BT_max_length_257
  data_files:
  - split: train
    path: sample-10BT_max_length_257/train-*
  - split: test
    path: sample-10BT_max_length_257/test-*
- config_name: sample-10BT_max_length_4097
  data_files:
  - split: train
    path: sample-10BT_max_length_4097/train-*
  - split: test
    path: sample-10BT_max_length_4097/test-*
- config_name: sample-10BT_max_length_513
  data_files:
  - split: train
    path: sample-10BT_max_length_513/train-*
  - split: test
    path: sample-10BT_max_length_513/test-*
---
提供机构:
kushalt



