ddudek/nanochat-dclm-baseline-150b-shuffle
收藏Hugging Face2026-01-25 更新2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/ddudek/nanochat-dclm-baseline-150b-shuffle
下载链接
链接失效反馈官方服务:
资源简介:
---
license: cc-by-4.0
datasets:
- mlfoundations/dclm-baseline-1.0
language:
- en
size_categories:
- 100B<n<1T
dataset_info:
features:
- name: text
dtype: string
splits:
- name: train
num_examples: 117968896
download_size: 250795388
configs:
- config_name: default
data_files:
- split: train
path: shard_*.parquet
---
# DCLM-Baseline 1.0 dataset (150BT/4% sample)
This repository contains a repackaged version of the DCLM-Baseline 1.0 dataset for efficient use with Andrej Karpathy’s [Nanochat project](https://github.com/karpathy/nanochat).
It contains 117,968,896 documents (~666,856,099,899 characters) split across 2621 shards.
It is intended as a drop-in replacement for the FineWeb-Edu dataset.
## How to load the dataset
The dataset format is identical to FineWeb. To use it with Nanochat, simply replace the dataset URL in `nanochat/dataset.py`:
```
BASE_URL = "https://huggingface.co/datasets/ddudek/nanochat-dclm-baseline-150b-shuffle/resolve/main"
MAX_SHARD = 2621
```
## Scripts used:
The dataset was created in two stages:
### 1. Sampling 4% docs from the dataset
The sample size was chosen to be slightly larger than the original FineWeb-Edu dataset. The script is a lightly modified version of the original FineWeb-Edu repackaging script, with the following changes:
- used streaming (because of the enormous size of dclm dataset)
- no shuffling yet, since streaming was used
- added in-place random sampling
Note: I recommend writing directly to Arrow format instead of Parquet at this stage. The data gets converted to Arrow anyway later during shuffling, which added unnecessary overhead — but :shrug:
```python
import os
import time
import random
from datasets import load_dataset, Dataset
import datasets
import pyarrow.parquet as pq
import pyarrow as pa
# Stage 1: stream DCLM-Baseline, keep a random ~4% of the documents and write
# them out as Parquet shards of roughly `chars_per_shard` characters each.
dataset = load_dataset('mlfoundations/dclm-baseline-1.0-parquet', split='train', streaming=True) # <- set streaming
ndocs = 2732074726 # as estimated by huggingface page, turned out to be a bit more
sample_fraction = 0.04 # 4%, targeted to be a bit more than the original fineweb-edu dataset size
ds = dataset
random.seed(382)  # fixed seed for the sampling RNG

# needs ~256 GB of space
output_folder = "/mnt/THE_2TB/base_data_dclm_orig_nonshuffle"
# The folder is an absolute path, so it is used directly. The original joined
# it onto an undefined `base_dir`, which raised NameError before any work ran.
output_dir = output_folder
os.makedirs(output_dir, exist_ok=True)

processing_batch_size = 16 * 1024

# Write to parquet files
chars_per_shard = 250_000_000
row_group_size = 1024 # HF uses 1000 but we use multiple of 2, nicer for distributed data loader later
shard_docs = []
shard_index = 0
shard_characters = 0
total_docs_processed = 0         # every document seen in the stream
total_docs_processed_sample = 0  # documents actually written to shards
total_time_spent = 0
t0 = time.time()
ds_iter = ds.iter(batch_size=processing_batch_size)
for idx_batch, doc_batch in enumerate(ds_iter):
    for text in doc_batch['text']:
        total_docs_processed += 1
        # sampling here: keep each document with probability `sample_fraction`
        if random.random() >= sample_fraction:
            continue
        shard_docs.append(text)
        shard_characters += len(text)
        collected_enough_chars = shard_characters >= chars_per_shard
        docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0
        if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed)
            shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet")
            shard_table = pa.Table.from_pydict({"text": shard_docs})
            pq.write_table(
                shard_table,
                shard_path,
                row_group_size=row_group_size,
                use_dictionary=False, # this is usually used for categorical data
                compression="zstd", # Valid values: {‘NONE’, ‘SNAPPY’, ‘GZIP’, ‘BROTLI’, ‘LZ4’, ‘ZSTD’}
                compression_level=3,
                write_statistics=False, # not needed for text
            )
            t1 = time.time()
            dt = t1 - t0 # for this shard alone
            t0 = t1
            total_docs_processed_sample += len(shard_docs)
            total_time_spent += dt
            # `ndocs` is only an estimate (and a low one), so clamp the
            # remainder to keep the ETA from going negative near the end.
            remaining_docs = max(ndocs - total_docs_processed, 0)
            avg_time_per_doc = total_time_spent / total_docs_processed
            remaining_time = remaining_docs * avg_time_per_doc
            remaining_time_hours = remaining_time / 3600
            # `.2%` scales the fraction by 100; the original printed the raw
            # 0-1 fraction followed by a literal '%'.
            print(f"[{total_docs_processed / ndocs:.2%}] Wrote {shard_path}, batch {idx_batch}., #documents: {len(shard_docs)} | total written: {total_docs_processed_sample} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h")
            shard_docs = []
            shard_characters = 0
            shard_index += 1
# NOTE(review): documents still buffered in `shard_docs` when the stream ends
# are not written by this script, so the final partial shard is dropped.
```
### 2. Shuffling
Once the full 4% sample was collected, the dataset was shuffled to randomize document order, and written out in Nanochat-optimized format:
```python
import os
import time
import threading
from queue import Queue
from nanochat.common import get_base_dir_static
from datasets import load_dataset
import datasets
import pyarrow.parquet as pq
import pyarrow as pa
import pyarrow.dataset as ds
# Stage 2: load the sampled Parquet shards, shuffle the document order and
# re-write them as Parquet shards of roughly `chars_per_shard` characters each.

# Some config to improve speed
datasets.config.IN_MEMORY_MAX_SIZE = 8*134217728 # 8*128MB
writer_batch_size = 64 * 1024
dclm = load_dataset(
    "parquet",
    data_files={"train": "/mnt/THE_2TB/base_data_dclm_orig_nonshuffle/shard_*.parquet"},
    split="train",
    cache_dir="/mnt/FAST_1TB/base_data_dclm_orig_nonshuffle_cache", # <- ~670 GB needed for repackaging arrow, this could've been avoided if used arrow in the step 1
    num_proc=10,
)
ndocs = len(dclm) # 117968896

# Shuffle to scramble the order
print("Shuffling dataset...")
ds_shuf = dclm.shuffle(seed=42) # after this reading ds will be much slower, to mitigate use flatten_indices, but here we're repacking anyway
print("Finished.")

# Final step - package as for nanochat format
output_dir = "/mnt/THE_2TB/hf_cache/nanochat-dclm-baseline-shuffle42"
os.makedirs(output_dir, exist_ok=True)

# some batching to improve performance
processing_batch_size = 16 * 1024

# Write to parquet files
chars_per_shard = 250_000_000
row_group_size = 1024 # HF uses 1000 but we use multiple of 2, nicer for distributed data loader later
shard_docs = []
shard_index = 0
shard_characters = 0
total_docs_processed = 0         # every document read from the shuffled set
total_docs_processed_sample = 0  # documents actually written to shards
total_time_spent = 0
t0 = time.time()
ds_iter = ds_shuf.iter(batch_size=processing_batch_size)
for idx_batch, doc_batch in enumerate(ds_iter):
    for text in doc_batch['text']:
        total_docs_processed += 1
        shard_docs.append(text)
        shard_characters += len(text)
        collected_enough_chars = shard_characters >= chars_per_shard
        docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0
        if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed)
            shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet")
            shard_table = pa.Table.from_pydict({"text": shard_docs})
            pq.write_table(
                shard_table,
                shard_path,
                row_group_size=row_group_size,
                use_dictionary=False, # this is usually used for categorical data
                compression="zstd", # Valid values: {‘NONE’, ‘SNAPPY’, ‘GZIP’, ‘BROTLI’, ‘LZ4’, ‘ZSTD’}
                compression_level=3,
                write_statistics=False, # not needed for text
            )
            t1 = time.time()
            dt = t1 - t0 # for this shard alone
            t0 = t1
            total_docs_processed_sample += len(shard_docs)
            total_time_spent += dt
            remaining_docs = ndocs - total_docs_processed
            avg_time_per_doc = total_time_spent / total_docs_processed
            remaining_time = remaining_docs * avg_time_per_doc
            remaining_time_hours = remaining_time / 3600
            # `.2%` scales the fraction by 100; the original printed the raw
            # 0-1 fraction followed by a literal '%'.
            print(f"[{total_docs_processed / ndocs:.2%}] Wrote {shard_path}, batch {idx_batch}., #documents: {len(shard_docs)} | total written: {total_docs_processed_sample} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h")
            shard_docs = []
            shard_characters = 0
            shard_index += 1
# NOTE(review): documents still buffered in `shard_docs` when iteration ends
# are not written by this script, so the final partial shard is dropped.
```
Uploaded using:
```python
# ~666 856 099 899 characters, 117968896 docs
repo_id="ddudek/nanochat-dclm-baseline-150b-shuffle"


def upload():
    """Upload the shard folder to the Hugging Face Hub as a dataset repo.

    The access token is read from the ``HF_TOKEN`` environment variable.
    The folder uploaded is the module-level ``output_dir``, which this
    snippet does not define (presumably the one set by the shuffling
    script; confirm before running).
    """
    import os
    from huggingface_hub import HfApi

    hub_client = HfApi(token=os.getenv("HF_TOKEN"))
    upload_kwargs = {
        "folder_path": output_dir,
        "repo_id": repo_id,
        "repo_type": "dataset",
    }
    hub_client.upload_large_folder(**upload_kwargs)


upload()
```
许可证:CC BY 4.0
关联数据集:
- mlfoundations/dclm-baseline-1.0
语言:
- 英语(en)
规模类别:
- 1000亿 < 样本数 < 1万亿
数据集信息:
特征:
- 名称:text,数据类型:字符串(string)
数据划分:
- 划分名称:train(训练集),样本数:117968896
下载大小:250795388字节
配置项:
- 配置名称:default,数据文件:
- 对应划分:train,路径:shard_*.parquet(Parquet格式分片文件)
# DCLM-Baseline 1.0 数据集(1500亿Token/4%采样子集)
本仓库为适配Andrej Karpathy的[Nanochat项目](https://github.com/karpathy/nanochat)的高效使用,重新打包了DCLM-Baseline 1.0数据集。
该数据集包含117,968,896份文档,总字符数约为666,856,099,899,共2621个数据分片(shard)。
本数据集可作为FineWeb-Edu数据集的即插即用替代方案。
## 数据集加载方法
该数据集格式与FineWeb完全一致。若要在Nanochat中使用,仅需修改`nanochat/dataset.py`中的数据集URL:
BASE_URL = "https://huggingface.co/datasets/ddudek/nanochat-dclm-baseline-150b-shuffle/resolve/main"
MAX_SHARD = 2621
## 制作脚本说明
本数据集分为两个阶段制作完成:
### 1. 从原数据集采样4%的文档
采样规模设定为略大于原始FineWeb-Edu数据集。所用脚本基于原始FineWeb-Edu打包脚本稍加修改,改动如下:
- 采用流式加载(streaming,因DCLM数据集体量巨大)
- 因使用流式加载,暂不进行打乱操作
- 新增原地随机采样逻辑
注意:本阶段建议直接写入Arrow格式(Arrow)而非Parquet格式(Parquet)。后续打乱阶段仍需进行格式转换,会带来不必要的开销——但暂且如此。
python
import os
import time
import random
from datasets import load_dataset, Dataset
import datasets
import pyarrow.parquet as pq
import pyarrow as pa
# Stage 1: stream DCLM-Baseline, keep a random ~4% of the documents and write
# them out as Parquet shards of roughly `chars_per_shard` characters each.
dataset = load_dataset('mlfoundations/dclm-baseline-1.0-parquet', split='train', streaming=True) # <- set streaming
ndocs = 2732074726 # as estimated by huggingface page, turned out to be a bit more
sample_fraction = 0.04 # 4%, targeted to be a bit more than the original fineweb-edu dataset size
ds = dataset
random.seed(382)  # fixed seed for the sampling RNG

# needs ~256 GB of space
output_folder = "/mnt/THE_2TB/base_data_dclm_orig_nonshuffle"
# The folder is an absolute path, so it is used directly. The original joined
# it onto an undefined `base_dir`, which raised NameError before any work ran.
output_dir = output_folder
os.makedirs(output_dir, exist_ok=True)

processing_batch_size = 16 * 1024

# Write to parquet files
chars_per_shard = 250_000_000
row_group_size = 1024 # HF uses 1000 but we use multiple of 2, nicer for distributed data loader later
shard_docs = []
shard_index = 0
shard_characters = 0
total_docs_processed = 0         # every document seen in the stream
total_docs_processed_sample = 0  # documents actually written to shards
total_time_spent = 0
t0 = time.time()
ds_iter = ds.iter(batch_size=processing_batch_size)
for idx_batch, doc_batch in enumerate(ds_iter):
    for text in doc_batch['text']:
        total_docs_processed += 1
        # sampling here: keep each document with probability `sample_fraction`
        if random.random() >= sample_fraction:
            continue
        shard_docs.append(text)
        shard_characters += len(text)
        collected_enough_chars = shard_characters >= chars_per_shard
        docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0
        if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed)
            shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet")
            shard_table = pa.Table.from_pydict({"text": shard_docs})
            pq.write_table(
                shard_table,
                shard_path,
                row_group_size=row_group_size,
                use_dictionary=False, # this is usually used for categorical data
                compression="zstd", # Valid values: {‘NONE’, ‘SNAPPY’, ‘GZIP’, ‘BROTLI’, ‘LZ4’, ‘ZSTD’}
                compression_level=3,
                write_statistics=False, # not needed for text
            )
            t1 = time.time()
            dt = t1 - t0 # for this shard alone
            t0 = t1
            total_docs_processed_sample += len(shard_docs)
            total_time_spent += dt
            # `ndocs` is only an estimate (and a low one), so clamp the
            # remainder to keep the ETA from going negative near the end.
            remaining_docs = max(ndocs - total_docs_processed, 0)
            avg_time_per_doc = total_time_spent / total_docs_processed
            remaining_time = remaining_docs * avg_time_per_doc
            remaining_time_hours = remaining_time / 3600
            # `.2%` scales the fraction by 100; the original printed the raw
            # 0-1 fraction followed by a literal '%'.
            print(f"[{total_docs_processed / ndocs:.2%}] Wrote {shard_path}, batch {idx_batch}., #documents: {len(shard_docs)} | total written: {total_docs_processed_sample} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h")
            shard_docs = []
            shard_characters = 0
            shard_index += 1
# NOTE(review): documents still buffered in `shard_docs` when the stream ends
# are not written by this script, so the final partial shard is dropped.
### 2. 数据打乱
完成全部4%采样子集的采集后,对数据集进行打乱以随机化文档顺序,并按照适配Nanochat的格式输出:
python
import os
import time
import threading
from queue import Queue
from nanochat.common import get_base_dir_static
from datasets import load_dataset
import datasets
import pyarrow.parquet as pq
import pyarrow as pa
import pyarrow.dataset as ds
# Stage 2: load the sampled Parquet shards, shuffle the document order and
# re-write them as Parquet shards of roughly `chars_per_shard` characters each.

# Some config to improve speed
datasets.config.IN_MEMORY_MAX_SIZE = 8*134217728 # 8*128MB
writer_batch_size = 64 * 1024
dclm = load_dataset(
    "parquet",
    data_files={"train": "/mnt/THE_2TB/base_data_dclm_orig_nonshuffle/shard_*.parquet"},
    split="train",
    cache_dir="/mnt/FAST_1TB/base_data_dclm_orig_nonshuffle_cache", # <- ~670 GB needed for repackaging arrow, this could've been avoided if used arrow in the step 1
    num_proc=10,
)
ndocs = len(dclm) # 117968896

# Shuffle to scramble the order
print("Shuffling dataset...")
ds_shuf = dclm.shuffle(seed=42) # after this reading ds will be much slower, to mitigate use flatten_indices, but here we're repacking anyway
print("Finished.")

# Final step - package as for nanochat format
output_dir = "/mnt/THE_2TB/hf_cache/nanochat-dclm-baseline-shuffle42"
os.makedirs(output_dir, exist_ok=True)

# some batching to improve performance
processing_batch_size = 16 * 1024

# Write to parquet files
chars_per_shard = 250_000_000
row_group_size = 1024 # HF uses 1000 but we use multiple of 2, nicer for distributed data loader later
shard_docs = []
shard_index = 0
shard_characters = 0
total_docs_processed = 0         # every document read from the shuffled set
total_docs_processed_sample = 0  # documents actually written to shards
total_time_spent = 0
t0 = time.time()
ds_iter = ds_shuf.iter(batch_size=processing_batch_size)
for idx_batch, doc_batch in enumerate(ds_iter):
    for text in doc_batch['text']:
        total_docs_processed += 1
        shard_docs.append(text)
        shard_characters += len(text)
        collected_enough_chars = shard_characters >= chars_per_shard
        docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0
        if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed)
            shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet")
            shard_table = pa.Table.from_pydict({"text": shard_docs})
            pq.write_table(
                shard_table,
                shard_path,
                row_group_size=row_group_size,
                use_dictionary=False, # this is usually used for categorical data
                compression="zstd", # Valid values: {‘NONE’, ‘SNAPPY’, ‘GZIP’, ‘BROTLI’, ‘LZ4’, ‘ZSTD’}
                compression_level=3,
                write_statistics=False, # not needed for text
            )
            t1 = time.time()
            dt = t1 - t0 # for this shard alone
            t0 = t1
            total_docs_processed_sample += len(shard_docs)
            total_time_spent += dt
            remaining_docs = ndocs - total_docs_processed
            avg_time_per_doc = total_time_spent / total_docs_processed
            remaining_time = remaining_docs * avg_time_per_doc
            remaining_time_hours = remaining_time / 3600
            # `.2%` scales the fraction by 100; the original printed the raw
            # 0-1 fraction followed by a literal '%'.
            print(f"[{total_docs_processed / ndocs:.2%}] Wrote {shard_path}, batch {idx_batch}., #documents: {len(shard_docs)} | total written: {total_docs_processed_sample} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h")
            shard_docs = []
            shard_characters = 0
            shard_index += 1
# NOTE(review): documents still buffered in `shard_docs` when iteration ends
# are not written by this script, so the final partial shard is dropped.
## 上传脚本
python
# ~666 856 099 899 characters, 117968896 docs
repo_id="ddudek/nanochat-dclm-baseline-150b-shuffle"


def upload():
    """Upload the shard folder to the Hugging Face Hub as a dataset repo.

    The access token is read from the ``HF_TOKEN`` environment variable.
    The folder uploaded is the module-level ``output_dir``, which this
    snippet does not define (presumably the one set by the shuffling
    script; confirm before running).
    """
    import os
    from huggingface_hub import HfApi

    hub_client = HfApi(token=os.getenv("HF_TOKEN"))
    upload_kwargs = {
        "folder_path": output_dir,
        "repo_id": repo_id,
        "repo_type": "dataset",
    }
    hub_client.upload_large_folder(**upload_kwargs)


upload()
提供机构:
ddudek



