five

ddudek/nanochat-dclm-baseline-150b-shuffle

收藏
Hugging Face 2026-01-25 更新 2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/ddudek/nanochat-dclm-baseline-150b-shuffle
下载链接
链接失效反馈
官方服务:
资源简介:
---
license: cc-by-4.0
datasets:
- mlfoundations/dclm-baseline-1.0
language:
- en
size_categories:
- 100B<n<1T
dataset_info:
  features:
  - name: text
    dtype: string
  splits:
  - name: train
    num_examples: 117968896
  download_size: 250795388
configs:
- config_name: default
  data_files:
  - split: train
    path: shard_*.parquet
---

# DCLM-Baseline 1.0 dataset (150BT/4% sample)

This repository contains a repackaged version of the DCLM-Baseline 1.0 dataset for efficient use with Andrej Karpathy’s [Nanochat project](https://github.com/karpathy/nanochat).

117968896 docs, ~666 856 099 899 characters, 2621 shards.

It is intended as a drop-in replacement for the FineWeb-Edu dataset.

## How to load the dataset

The dataset format is identical to FineWeb. To use it with Nanochat, simply replace the dataset URL in `nanochat/dataset.py`:

```
BASE_URL = "https://huggingface.co/datasets/ddudek/nanochat-dclm-baseline-150b-shuffle/resolve/main"
MAX_SHARD = 2621
```

## Scripts used:

The dataset was created in two stages:

### 1. Sampling 4% of docs from the dataset

The sample size was targeted to be a bit larger than the original FineWeb-Edu. The script was a slightly modified version of the original FineWeb-Edu repackaging script, with the following changes:

- used streaming (because of the enormous size of the DCLM dataset)
- no shuffling yet, since streaming was used
- added in-place random sampling

Note: I recommend writing directly to Arrow format instead of Parquet at this stage.
It's done either way later during shuffling, which added unnecessary overhead — but :shrug: ```python import os import time import random from datasets import load_dataset, Dataset import datasets import pyarrow.parquet as pq import pyarrow as pa dataset = load_dataset('mlfoundations/dclm-baseline-1.0-parquet', split='train', streaming=True) # <- set streaming ndocs = 2732074726 # as estimated by huggingface page, turned out to be a bit more sample_fraction = 0.04 # 4%, targeted to be a bit more than the original fineweb-edu dataset size ds = dataset random.seed(382) # needs ~256 GB of space output_folder = f"/mnt/THE_2TB/base_data_dclm_orig_nonshuffle" output_dir = os.path.join(base_dir, output_folder) os.makedirs(output_dir, exist_ok=True) processing_batch_size=16*1024 # Write to parquet files chars_per_shard = 250_000_000 row_group_size = 1024 # HF uses 1000 but we use multiple of 2, nicer for distributed data loader later shard_docs = [] shard_index = 0 shard_characters = 0 total_docs_processed = 0 total_time_spent = 0 t0 = time.time() ds_iter = ds.iter(batch_size=processing_batch_size) total_docs_processed_sample = 0 for idx_batch, doc_batch in enumerate(ds_iter): for idx_doc, text in enumerate(doc_batch['text']): global_idx = idx_batch * processing_batch_size + idx_doc total_docs_processed += 1 # sampling here include_sample = random.random() < sample_fraction if not include_sample: continue # text = doc['text'] shard_docs.append(text) shard_characters += len(text) collected_enough_chars = shard_characters >= chars_per_shard docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0 if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed) shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet") shard_table = pa.Table.from_pydict({"text": shard_docs}) pq.write_table( shard_table, shard_path, row_group_size=row_group_size, use_dictionary=False, # this is usually used for categorical data 
compression="zstd", # Valid values: {‘NONE’, ‘SNAPPY’, ‘GZIP’, ‘BROTLI’, ‘LZ4’, ‘ZSTD’} compression_level=3, write_statistics=False, # not needed for text ) t1 = time.time() dt = t1 - t0 # for this shard alone t0 = t1 total_docs_processed_sample += len(shard_docs) total_time_spent += dt remaining_docs = ndocs - total_docs_processed avg_time_per_doc = total_time_spent / total_docs_processed remaining_time = remaining_docs * avg_time_per_doc remaining_time_hours = remaining_time / 3600 print(f"[{(total_docs_processed/ndocs):.2f}%] Wrote {shard_path}, batch {idx_batch}., #documents: {len(shard_docs)} | total written: {total_docs_processed_sample} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h") shard_docs = [] shard_characters = 0 shard_index += 1 num_synthetic_added = 0 ``` ### 2. Shuffling Once the full 4% sample was collected, the dataset was shuffled to randomize document order, and written out in Nanochat-optimized format: ```python import os import time import threading from queue import Queue from nanochat.common import get_base_dir_static from datasets import load_dataset import datasets import pyarrow.parquet as pq import pyarrow as pa import pyarrow.dataset as ds # Some config to improve speed datasets.config.IN_MEMORY_MAX_SIZE = 8*134217728 # 8*128MB writer_batch_size=64 * 1024 dclm = load_dataset( "parquet", data_files={"train": "/mnt/THE_2TB/base_data_dclm_orig_nonshuffle/shard_*.parquet"}, split="train", cache_dir="/mnt/FAST_1TB/base_data_dclm_orig_nonshuffle_cache", # <- ~670 GB needed for repackaging arrow, this could've been avoided if used arrow in the step 1 num_proc=10, ) ndocs = len(dclm) # 117968896 # Shuffle to scramble the order print("Shuffling dataset...") ds_shuf = dclm.shuffle(seed=42) # after this reading ds will be much slower, to mitigate use flatten_indices, but here we're repacking anyway print("Finished.") # Final step - package as for nanochat format output_dir = 
"/mnt/THE_2TB/hf_cache/nanochat-dclm-baseline-shuffle42" os.makedirs(output_dir, exist_ok=True) # some batching to improve performance processing_batch_size=16*1024 # Write to parquet files chars_per_shard = 250_000_000 row_group_size = 1024 # HF uses 1000 but we use multiple of 2, nicer for distributed data loader later shard_docs = [] shard_index = 0 shard_characters = 0 total_docs_processed = 0 total_time_spent = 0 t0 = time.time() ds_iter = ds_shuf.iter(batch_size=processing_batch_size) total_docs_processed_sample = 0 for idx_batch, doc_batch in enumerate(ds_iter): for idx_doc, text in enumerate(doc_batch['text']): global_idx = idx_batch * processing_batch_size + idx_doc total_docs_processed += 1 # text = doc['text'] shard_docs.append(text) shard_characters += len(text) collected_enough_chars = shard_characters >= chars_per_shard docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0 if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed) shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet") shard_table = pa.Table.from_pydict({"text": shard_docs}) pq.write_table( shard_table, shard_path, row_group_size=row_group_size, use_dictionary=False, # this is usually used for categorical data compression="zstd", # Valid values: {‘NONE’, ‘SNAPPY’, ‘GZIP’, ‘BROTLI’, ‘LZ4’, ‘ZSTD’} compression_level=3, write_statistics=False, # not needed for text ) t1 = time.time() dt = t1 - t0 # for this shard alone t0 = t1 total_docs_processed_sample += len(shard_docs) total_time_spent += dt remaining_docs = ndocs - total_docs_processed avg_time_per_doc = total_time_spent / total_docs_processed remaining_time = remaining_docs * avg_time_per_doc remaining_time_hours = remaining_time / 3600 print(f"[{(total_docs_processed/ndocs):.2f}%] Wrote {shard_path}, batch {idx_batch}., #documents: {len(shard_docs)} | total written: {total_docs_processed_sample} | #characters: {shard_characters} | time: {dt:.2f}s | 
remaining time: {remaining_time_hours:.2f}h") shard_docs = [] shard_characters = 0 shard_index += 1 num_synthetic_added = 0 ``` Uploaded using: ```python # ~666 856 099 899 characters, 117968896 docs repo_id="ddudek/nanochat-dclm-baseline-150b-shuffle" def upload(): import os from huggingface_hub import HfApi token = os.getenv("HF_TOKEN") api = HfApi(token=token) api.upload_large_folder( folder_path=output_dir, repo_id=repo_id, repo_type="dataset", ) upload() ```

许可证:CC BY 4.0 关联数据集: - mlfoundations/dclm-baseline-1.0 语言: - 英语(en) 规模类别: - 1000亿 < 样本数 < 1万亿 数据集信息: 特征: - 名称:text,数据类型:字符串(string) 数据划分: - 划分名称:train(训练集),样本数:117968896 下载大小:250795388字节 配置项: - 配置名称:default,数据文件: - 对应划分:train,路径:shard_*.parquet(Parquet格式分片文件) # DCLM-Baseline 1.0 数据集(1500亿Token/4%采样子集) 本仓库为适配Andrej Karpathy的[Nanochat项目](https://github.com/karpathy/nanochat)的高效使用,重新打包了DCLM-Baseline 1.0数据集。 该数据集包含117,968,896份文档,总字符数约为666,856,099,899,共2621个数据分片(shard)。 本数据集可作为FineWeb-Edu数据集的即插即用替代方案。 ## 数据集加载方法 该数据集格式与FineWeb完全一致。若要在Nanochat中使用,仅需修改`nanochat/dataset.py`中的数据集URL: BASE_URL = "https://huggingface.co/datasets/ddudek/nanochat-dclm-baseline-150b-shuffle/resolve/main" MAX_SHARD = 2621 ## 制作脚本说明 本数据集分为两个阶段制作完成: ### 1. 从原数据集采样4%的文档 采样规模设定为略大于原始FineWeb-Edu数据集。所用脚本基于原始FineWeb-Edu打包脚本稍加修改,改动如下: - 采用流式加载(streaming,因DCLM数据集体量巨大) - 因使用流式加载,暂不进行打乱操作 - 新增原地随机采样逻辑 注意:本阶段建议直接写入Arrow格式(Arrow)而非Parquet格式(Parquet)。后续打乱阶段仍需进行格式转换,会带来不必要的开销——但暂且如此。 python import os import time import random from datasets import load_dataset, Dataset import datasets import pyarrow.parquet as pq import pyarrow as pa dataset = load_dataset('mlfoundations/dclm-baseline-1.0-parquet', split='train', streaming=True) # <- set streaming ndocs = 2732074726 # as estimated by huggingface page, turned out to be a bit more sample_fraction = 0.04 # 4%, targeted to be a bit more than the original fineweb-edu dataset size ds = dataset random.seed(382) # needs ~256 GB of space output_folder = f"/mnt/THE_2TB/base_data_dclm_orig_nonshuffle" output_dir = os.path.join(base_dir, output_folder) os.makedirs(output_dir, exist_ok=True) processing_batch_size=16*1024 # Write to parquet files chars_per_shard = 250_000_000 row_group_size = 1024 # HF uses 1000 but we use multiple of 2, nicer for distributed data loader later shard_docs = [] shard_index = 0 shard_characters = 0 total_docs_processed = 0 total_time_spent = 0 t0 = time.time() ds_iter = ds.iter(batch_size=processing_batch_size) total_docs_processed_sample = 0 
for idx_batch, doc_batch in enumerate(ds_iter): for idx_doc, text in enumerate(doc_batch['text']): global_idx = idx_batch * processing_batch_size + idx_doc total_docs_processed += 1 # sampling here include_sample = random.random() < sample_fraction if not include_sample: continue # text = doc['text'] shard_docs.append(text) shard_characters += len(text) collected_enough_chars = shard_characters >= chars_per_shard docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0 if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed) shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet") shard_table = pa.Table.from_pydict({"text": shard_docs}) pq.write_table( shard_table, shard_path, row_group_size=row_group_size, use_dictionary=False, # this is usually used for categorical data compression="zstd", # Valid values: {‘NONE’, ‘SNAPPY’, ‘GZIP’, ‘BROTLI’, ‘LZ4’, ‘ZSTD’} compression_level=3, write_statistics=False, # not needed for text ) t1 = time.time() dt = t1 - t0 # for this shard alone t0 = t1 total_docs_processed_sample += len(shard_docs) total_time_spent += dt remaining_docs = ndocs - total_docs_processed avg_time_per_doc = total_time_spent / total_docs_processed remaining_time = remaining_docs * avg_time_per_doc remaining_time_hours = remaining_time / 3600 print(f"[{(total_docs_processed/ndocs):.2f}%] Wrote {shard_path}, batch {idx_batch}., #documents: {len(shard_docs)} | total written: {total_docs_processed_sample} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h") shard_docs = [] shard_characters = 0 shard_index += 1 num_synthetic_added = 0 ### 2. 
数据打乱 完成全部4%采样子集的采集后,对数据集进行打乱以随机化文档顺序,并按照适配Nanochat的格式输出: python import os import time import threading from queue import Queue from nanochat.common import get_base_dir_static from datasets import load_dataset import datasets import pyarrow.parquet as pq import pyarrow as pa import pyarrow.dataset as ds # Some config to improve speed datasets.config.IN_MEMORY_MAX_SIZE = 8*134217728 # 8*128MB writer_batch_size=64 * 1024 dclm = load_dataset( "parquet", data_files={"train": "/mnt/THE_2TB/base_data_dclm_orig_nonshuffle/shard_*.parquet"}, split="train", cache_dir="/mnt/FAST_1TB/base_data_dclm_orig_nonshuffle_cache", # <- ~670 GB needed for repackaging arrow, this could've been avoided if used arrow in the step 1 num_proc=10, ) ndocs = len(dclm) # 117968896 # Shuffle to scramble the order print("Shuffling dataset...") ds_shuf = dclm.shuffle(seed=42) # after this reading ds will be much slower, to mitigate use flatten_indices, but here we're repacking anyway print("Finished.") # Final step - package as for nanochat format output_dir = "/mnt/THE_2TB/hf_cache/nanochat-dclm-baseline-shuffle42" os.makedirs(output_dir, exist_ok=True) # some batching to improve performance processing_batch_size=16*1024 # Write to parquet files chars_per_shard = 250_000_000 row_group_size = 1024 # HF uses 1000 but we use multiple of 2, nicer for distributed data loader later shard_docs = [] shard_index = 0 shard_characters = 0 total_docs_processed = 0 total_time_spent = 0 t0 = time.time() ds_iter = ds_shuf.iter(batch_size=processing_batch_size) total_docs_processed_sample = 0 for idx_batch, doc_batch in enumerate(ds_iter): for idx_doc, text in enumerate(doc_batch['text']): global_idx = idx_batch * processing_batch_size + idx_doc total_docs_processed += 1 # text = doc['text'] shard_docs.append(text) shard_characters += len(text) collected_enough_chars = shard_characters >= chars_per_shard docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0 if collected_enough_chars and 
docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed) shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet") shard_table = pa.Table.from_pydict({"text": shard_docs}) pq.write_table( shard_table, shard_path, row_group_size=row_group_size, use_dictionary=False, # this is usually used for categorical data compression="zstd", # Valid values: {‘NONE’, ‘SNAPPY’, ‘GZIP’, ‘BROTLI’, ‘LZ4’, ‘ZSTD’} compression_level=3, write_statistics=False, # not needed for text ) t1 = time.time() dt = t1 - t0 # for this shard alone t0 = t1 total_docs_processed_sample += len(shard_docs) total_time_spent += dt remaining_docs = ndocs - total_docs_processed avg_time_per_doc = total_time_spent / total_docs_processed remaining_time = remaining_docs * avg_time_per_doc remaining_time_hours = remaining_time / 3600 print(f"[{(total_docs_processed/ndocs):.2f}%] Wrote {shard_path}, batch {idx_batch}., #documents: {len(shard_docs)} | total written: {total_docs_processed_sample} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h") shard_docs = [] shard_characters = 0 shard_index += 1 num_synthetic_added = 0 ## 上传脚本 python # ~666 856 099 899 characters, 117968896 docs repo_id="ddudek/nanochat-dclm-baseline-150b-shuffle" def upload(): import os from huggingface_hub import HfApi token = os.getenv("HF_TOKEN") api = HfApi(token=token) api.upload_large_folder( folder_path=output_dir, repo_id=repo_id, repo_type="dataset", ) upload()
提供机构:
ddudek
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作