Llasa_opensource_speech_data_160k_hours_tokenized
收藏魔搭社区2026-01-06 更新2025-02-01 收录
下载链接:
https://modelscope.cn/datasets/HKUSTAudio/Llasa_opensource_speech_data_160k_hours_tokenized
下载链接
链接失效反馈官方服务:
资源简介:
[arXiv:2502.04128](https://arxiv.org/abs/2502.04128)
**Update (2025-02-07):** Our paper has been released!
This script is for merging tokenized speech datasets stored in memmap format. The input datasets can be combined to form larger training datasets.
```python
import numpy as np
import os
def merge_memmap_datasets(dataset_dirs, output_dir, splits=('train', 'val'), dtype='int32'):
    """Concatenate tokenized memmap datasets into one memmap file per split.

    For every split, each directory in ``dataset_dirs`` is expected to hold
    ``<split>_input_ids.memmap`` (a 2-D array of token ids) and
    ``<split>_input_ids_shape.npy`` (the shape of that array). The rows of all
    usable datasets are stacked, in the order given, into
    ``<output_dir>/<split>_input_ids.memmap`` and the merged shape is saved to
    ``<output_dir>/<split>_input_ids_shape.npy``.

    Args:
        dataset_dirs: Iterable of input dataset directories.
        output_dir: Directory for the merged files; created if missing.
        splits: Names of the splits to merge. Defaults to ``('train', 'val')``.
        dtype: Element type of the input and output memmaps. Defaults to
            ``'int32'``.

    Returns:
        None. Problems are reported with ``print``:
        * a dataset missing its shape file or data file, or whose shape is
          not 2-D, is skipped with a warning;
        * a split with no usable data is skipped with an error message;
        * a sequence-length mismatch aborts the whole merge (splits that were
          already merged stay on disk).
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for split in splits:
        shapes = []
        seq_len = None
        total_samples = 0

        # Collect shapes of all datasets and check sequence length consistency
        for dataset_dir in dataset_dirs:
            shape_path = os.path.join(dataset_dir, f'{split}_input_ids_shape.npy')
            if not os.path.exists(shape_path):
                print(f"Warning: {split}_input_ids_shape.npy not found in {dataset_dir}, skipping this dataset.")
                continue

            # Check the data file up front so a half-written dataset cannot
            # crash the copy loop after the output file has been created.
            memmap_path = os.path.join(dataset_dir, f'{split}_input_ids.memmap')
            if not os.path.exists(memmap_path):
                print(f"Warning: {split}_input_ids.memmap not found in {dataset_dir}, skipping this dataset.")
                continue

            shape = np.load(shape_path)
            print(f"Loaded shape of {split} data from {dataset_dir}: {shape}")
            # Plain Python ints keep NumPy scalars out of the log output and
            # the saved shape file.
            shape = tuple(int(dim) for dim in np.ravel(shape))
            if len(shape) != 2:
                print(f"Warning: expected a 2-D shape for {split} data in {dataset_dir}, got {shape}; skipping this dataset.")
                continue

            if seq_len is None:
                seq_len = shape[1]
            elif seq_len != shape[1]:
                print(f"Error: Sequence length mismatch in {split} data from {dataset_dir}.")
                return

            shapes.append((dataset_dir, memmap_path, shape))
            total_samples += shape[0]

        if total_samples == 0:
            print(f"Error: No valid {split} data found for merging.")
            continue

        new_shape = (total_samples, seq_len)

        # Create new memmap file
        output_memmap_path = os.path.join(output_dir, f'{split}_input_ids.memmap')
        output_memmap = np.memmap(
            output_memmap_path, dtype=dtype, mode='w+', shape=new_shape
        )

        # Copy data from each dataset to the new memmap file
        start_idx = 0
        for dataset_dir, memmap_path, shape in shapes:
            data = np.memmap(
                memmap_path, dtype=dtype, mode='r', shape=shape
            )
            end_idx = start_idx + shape[0]
            output_memmap[start_idx:end_idx, :] = data
            print(f"Merged {split} data from {dataset_dir} into positions {start_idx}:{end_idx}")
            start_idx = end_idx
            del data  # Drop the read-only mapping before opening the next one

        # Write pending changes to disk explicitly, then drop the mapping
        output_memmap.flush()
        del output_memmap

        # Save the new shape file
        np.save(os.path.join(output_dir, f'{split}_input_ids_shape.npy'), new_shape)
        print(f"Completed merging {split} data. New shape: {new_shape}")
if __name__ == "__main__":
    # Input dataset directories, merged in the order listed.
    dataset_dirs = [
        'libriheavy_tts_1',
        'libriheavy_tts_2',
        'libriheavy_tts_3',
        'libriheavy_tts_4',
        'emilia_en_1',
        'emilia_en_2',
        'emilia_en_3',
        'emilia_en_4',
    ]
    # Directory that receives the merged memmap and shape files.
    output_dir = 'libriheavy_tts_all'
    merge_memmap_datasets(dataset_dirs, output_dir)
```
[arXiv:2502.04128](https://arxiv.org/abs/2502.04128)
**更新(2025年2月7日):** 我们的论文已正式发布!
本脚本用于合并以内存映射(memmap)格式存储的经过Token分词的语音数据集,可将多个输入数据集整合为更大规模的训练数据集。
python
import numpy as np
import os
def merge_memmap_datasets(dataset_dirs, output_dir, splits=('train', 'val'), dtype='int32'):
    """Concatenate tokenized memmap datasets into one memmap file per split.

    For every split, each directory in ``dataset_dirs`` is expected to hold
    ``<split>_input_ids.memmap`` (a 2-D array of token ids) and
    ``<split>_input_ids_shape.npy`` (the shape of that array). The rows of all
    usable datasets are stacked, in the order given, into
    ``<output_dir>/<split>_input_ids.memmap`` and the merged shape is saved to
    ``<output_dir>/<split>_input_ids_shape.npy``.

    Args:
        dataset_dirs: Iterable of input dataset directories.
        output_dir: Directory for the merged files; created if missing.
        splits: Names of the splits to merge. Defaults to ``('train', 'val')``.
        dtype: Element type of the input and output memmaps. Defaults to
            ``'int32'``.

    Returns:
        None. Problems are reported with ``print``:
        * a dataset missing its shape file or data file, or whose shape is
          not 2-D, is skipped with a warning;
        * a split with no usable data is skipped with an error message;
        * a sequence-length mismatch aborts the whole merge (splits that were
          already merged stay on disk).
    """
    # Make sure the output directory has been created
    os.makedirs(output_dir, exist_ok=True)

    for split in splits:
        shapes = []
        seq_len = None
        total_samples = 0

        # Gather the shape of every dataset and verify that sequence lengths agree
        for dataset_dir in dataset_dirs:
            shape_path = os.path.join(dataset_dir, f'{split}_input_ids_shape.npy')
            if not os.path.exists(shape_path):
                print(f"Warning: {split}_input_ids_shape.npy not found in {dataset_dir}, skipping this dataset.")
                continue

            # Check the data file up front so a half-written dataset cannot
            # crash the copy loop after the output file has been created.
            memmap_path = os.path.join(dataset_dir, f'{split}_input_ids.memmap')
            if not os.path.exists(memmap_path):
                print(f"Warning: {split}_input_ids.memmap not found in {dataset_dir}, skipping this dataset.")
                continue

            shape = np.load(shape_path)
            print(f"Loaded shape of {split} data from {dataset_dir}: {shape}")
            # Plain Python ints keep NumPy scalars out of the log output and
            # the saved shape file.
            shape = tuple(int(dim) for dim in np.ravel(shape))
            if len(shape) != 2:
                print(f"Warning: expected a 2-D shape for {split} data in {dataset_dir}, got {shape}; skipping this dataset.")
                continue

            if seq_len is None:
                seq_len = shape[1]
            elif seq_len != shape[1]:
                print(f"Error: Sequence length mismatch in {split} data from {dataset_dir}.")
                return

            shapes.append((dataset_dir, memmap_path, shape))
            total_samples += shape[0]

        if total_samples == 0:
            print(f"Error: No valid {split} data found for merging.")
            continue

        new_shape = (total_samples, seq_len)

        # Create the new memory-mapped file
        output_memmap_path = os.path.join(output_dir, f'{split}_input_ids.memmap')
        output_memmap = np.memmap(
            output_memmap_path, dtype=dtype, mode='w+', shape=new_shape
        )

        # Copy each dataset's rows into the new memory-mapped file
        start_idx = 0
        for dataset_dir, memmap_path, shape in shapes:
            data = np.memmap(
                memmap_path, dtype=dtype, mode='r', shape=shape
            )
            end_idx = start_idx + shape[0]
            output_memmap[start_idx:end_idx, :] = data
            print(f"Merged {split} data from {dataset_dir} into positions {start_idx}:{end_idx}")
            start_idx = end_idx
            del data  # Drop the read-only mapping before opening the next one

        # Write pending changes to disk explicitly, then drop the mapping
        output_memmap.flush()
        del output_memmap

        # Save the new shape file
        np.save(os.path.join(output_dir, f'{split}_input_ids_shape.npy'), new_shape)
        print(f"Completed merging {split} data. New shape: {new_shape}")
if __name__ == "__main__":
    # Input dataset directories, merged in the order listed.
    dataset_dirs = [
        'libriheavy_tts_1',
        'libriheavy_tts_2',
        'libriheavy_tts_3',
        'libriheavy_tts_4',
        'emilia_en_1',
        'emilia_en_2',
        'emilia_en_3',
        'emilia_en_4',
    ]
    # Directory that receives the merged memmap and shape files.
    output_dir = 'libriheavy_tts_all'
    merge_memmap_datasets(dataset_dirs, output_dir)
提供机构:
maas
创建时间:
2025-02-06



