下载链接：

https://modelscope.cn/datasets/HKUSTAudio/Llasa_opensource_speech_data_160k_hours_tokenized

下载链接

链接失效反馈

官方服务：

资源简介：

[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2502.04128)

**Update (2025-02-07):** Our paper has been released!

This script is for merging tokenized speech datasets stored in memmap format. The input datasets can be combined to form larger training datasets.

```python
import numpy as np
import os

def merge_memmap_datasets(dataset_dirs, output_dir):
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Dataset splits to be merged
    splits = ['train', 'val']

    for split in splits:
        shapes = []
        seq_len = None
        total_samples = 0

        # Collect shapes of all datasets and check sequence length consistency
        for dataset_dir in dataset_dirs:
            shape_path = os.path.join(dataset_dir, f'{split}_input_ids_shape.npy')
            if not os.path.exists(shape_path):
                print(f"Warning: {split}_input_ids_shape.npy not found in {dataset_dir}, skipping this dataset.")
                continue
            shape = np.load(shape_path)
            print(f"Loaded shape of {split} data from {dataset_dir}: {shape}")
            shape = tuple(shape)
            shapes.append((dataset_dir, shape))
            total_samples += shape[0]
            if seq_len is None:
                seq_len = shape[1]
            elif seq_len != shape[1]:
                print(f"Error: Sequence length mismatch in {split} data from {dataset_dir}.")
                return

        if total_samples == 0:
            print(f"Error: No valid {split} data found for merging.")
            continue

        new_shape = (total_samples, seq_len)

        # Create new memmap file
        output_memmap_path = os.path.join(output_dir, f'{split}_input_ids.memmap')
        output_memmap = np.memmap(
            output_memmap_path, dtype='int32', mode='w+', shape=new_shape
        )

        # Copy data from each dataset to the new memmap file
        start_idx = 0
        for dataset_dir, shape in shapes:
            memmap_path = os.path.join(dataset_dir, f'{split}_input_ids.memmap')
            data = np.memmap(
                memmap_path, dtype='int32', mode='r', shape=shape
            )
            end_idx = start_idx + shape[0]
            output_memmap[start_idx:end_idx, :] = data[:]
            print(f"Merged {split} data from {dataset_dir} into positions {start_idx}:{end_idx}")
            start_idx = end_idx
            del data  # Free memory

        # Delete temporary variable and flush data to disk
        del output_memmap

        # Save the new shape file
        np.save(os.path.join(output_dir, f'{split}_input_ids_shape.npy'), new_shape)

        print(f"Completed merging {split} data. New shape: {new_shape}")

if __name__ == "__main__":
    dataset_dirs = [
        'libriheavy_tts_1',
        'libriheavy_tts_2',
        'libriheavy_tts_3',
        'libriheavy_tts_4',
        'emilia_en_1',
        'emilia_en_2',
        'emilia_en_3',
        'emilia_en_4',
    ]
    output_dir = 'libriheavy_tts_all'
    merge_memmap_datasets(dataset_dirs, output_dir)
```

[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2502.04128)

**更新（2025年2月7日）：** 我们的论文已正式发布！

本脚本用于合并以内存映射（memmap）格式存储的已 token 化的语音数据集，可将多个输入数据集整合为更大规模的训练数据集。

```python
import numpy as np
import os

def merge_memmap_datasets(dataset_dirs, output_dir):
    # 确保输出目录已创建
    os.makedirs(output_dir, exist_ok=True)

    # 待合并的数据集拆分类型
    splits = ['train', 'val']

    for split in splits:
        shapes = []
        seq_len = None
        total_samples = 0

        # 遍历收集各数据集的形状，并校验序列长度一致性
        for dataset_dir in dataset_dirs:
            shape_path = os.path.join(dataset_dir, f'{split}_input_ids_shape.npy')
            if not os.path.exists(shape_path):
                print(f"Warning: {split}_input_ids_shape.npy not found in {dataset_dir}, skipping this dataset.")
                continue
            shape = np.load(shape_path)
            print(f"Loaded shape of {split} data from {dataset_dir}: {shape}")
            shape = tuple(shape)
            shapes.append((dataset_dir, shape))
            total_samples += shape[0]
            if seq_len is None:
                seq_len = shape[1]
            elif seq_len != shape[1]:
                print(f"Error: Sequence length mismatch in {split} data from {dataset_dir}.")
                return

        if total_samples == 0:
            print(f"Error: No valid {split} data found for merging.")
            continue

        new_shape = (total_samples, seq_len)

        # 创建新的内存映射文件
        output_memmap_path = os.path.join(output_dir, f'{split}_input_ids.memmap')
        output_memmap = np.memmap(
            output_memmap_path, dtype='int32', mode='w+', shape=new_shape
        )

        # 将各数据集的数据复制到新的内存映射文件中
        start_idx = 0
        for dataset_dir, shape in shapes:
            memmap_path = os.path.join(dataset_dir, f'{split}_input_ids.memmap')
            data = np.memmap(
                memmap_path, dtype='int32', mode='r', shape=shape
            )
            end_idx = start_idx + shape[0]
            output_memmap[start_idx:end_idx, :] = data[:]
            print(f"Merged {split} data from {dataset_dir} into positions {start_idx}:{end_idx}")
            start_idx = end_idx
            del data  # 释放内存

        # 删除临时变量并将数据刷新至磁盘
        del output_memmap

        # 保存新的形状文件
        np.save(os.path.join(output_dir, f'{split}_input_ids_shape.npy'), new_shape)

        print(f"Completed merging {split} data. New shape: {new_shape}")

if __name__ == "__main__":
    dataset_dirs = [
        'libriheavy_tts_1',
        'libriheavy_tts_2',
        'libriheavy_tts_3',
        'libriheavy_tts_4',
        'emilia_en_1',
        'emilia_en_2',
        'emilia_en_3',
        'emilia_en_4',
    ]
    output_dir = 'libriheavy_tts_all'
    merge_memmap_datasets(dataset_dirs, output_dir)
```

应用场景：