georgechang8/ASCEND_CLEAN
数据集概述
基本信息
- 语言: 英语, 简体中文
- 许可证: CC-BY-SA-4.0
- 数据集大小: 10K<n<100K
数据集配置
-
默认配置:
- 特征:
id: 字符串; path: 字符串; audio: 音频 (采样率 16000); transcription: 字符串; duration: 浮点数; language: 字符串; original_speaker_id: 整数; session_id: 整数; topic: 字符串
- 分割:
train: 9869个样本, 1014558975.36字节; test: 1315个样本, 106170264.135字节; validation: 1130个样本, 106771606.91字节
- 下载大小: 1223500329字节
- 数据集大小: 1227500846.4050002字节
- 特征:
-
30s配置:
- 数据文件:
train: 30s/train-*; validation: 30s/validation-*; test: 30s/test-*
- 数据文件:
数据处理
-
加载源数据 (Python):
from datasets import load_dataset, Audio as DSAudio

data_raw = load_dataset("CAiRE/ASCEND")
data_raw = data_raw.cast_column("audio", DSAudio(sampling_rate=16000))
-
清理停用词 (Python):
import re
# Character class covering the CJK ideograph, punctuation, symbol and
# full-width ranges that may directly touch a filler word.
_CJK = (
    "[\u3400-\u4db5\u4e00-\u9fa5\u9fa6-\u9fbb\uf900-\ufa2d\ufa30-\ufa6a"
    "\ufa70-\ufad9\uff00-\uffef\u2e80-\u2eff\u3000-\u303f\u31c0-\u31ef"
    "\u2f00-\u2fdf\u2ff0-\u2fff\u3100-\u312f\u31a0-\u31bf\ufe10-\ufe1f"
    "\ufe30-\ufe4f\u2600-\u26ff\u2700-\u27bf\u3200-\u32ff\u3300-\u33ff]"
)
# Ellipses, any whitespace, and the string start/end.
_BOUNDARY_RE = re.compile(r"\.\.\.|\s|^|$")
# "um"/"uh" (Um, Uh, um, uh, UM, UH) flanked by a CJK character or whitespace.
_FILLER_RE = re.compile(rf"({_CJK}|\s)([Uu][mh]|U[MH])({_CJK}|\s)")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_transcripts(x):
    """Remove filler words from a transcript and normalise its whitespace.

    Removes the English fillers "um"/"uh" when they are delimited by
    whitespace or CJK characters, removes the Chinese fillers 嗯 and 呃
    wherever they occur, replaces "..." with a space, and collapses runs of
    whitespace into a single space.

    Args:
        x: The raw transcript string.

    Returns:
        The cleaned transcript, stripped of leading/trailing whitespace.
        May be the empty string if the transcript held only fillers.
    """
    # Every boundary becomes TWO spaces so that consecutive fillers
    # (" uh uh") each keep a delimiter of their own: matches do not overlap,
    # so with single spaces the second "uh" would lose its leading space.
    x = _BOUNDARY_RE.sub("  ", x)
    # Drop the filler but keep both neighbours, separated by one space.
    x = _FILLER_RE.sub(r"\1 \3", x)
    x = x.replace("嗯", " ")
    x = x.replace("呃", " ")
    x = _WHITESPACE_RE.sub(" ", x)
    return x.strip()
# Clean every transcript in every split.
data = data_raw.map(
    lambda x: {"transcription": clean_transcripts(x["transcription"])}
)
# Drop samples whose transcript is empty after cleaning.
data = data.filter(lambda x: x["transcription"] != "")
-
隔离包含UNK的样本 (Python):
unks = data.filter(lambda x: "[UNK]" in x["transcription"])
unks.shape
输出:
{'train': (402, 9), 'test': (36, 9), 'validation': (63, 9)}
-
加载whisper模型 (Python):
from stable_whisper import load_faster_whisper

model = load_faster_whisper(
    "medium",
    device="cuda",
    compute_type="float16",
)
-
使用whisper-medium解析UNK (Python):
from sacrebleu.tokenizers.tokenizer_zh import TokenizerZh
from whisper_normalizer.basic import BasicTextNormalizer
import cn2an
import json
import jiwer
from tqdm.auto import tqdm
sacretok = TokenizerZh()
whisper_norm = BasicTextNormalizer()


def compute_mer(hyp, ref):
    """Return the jiwer word error rate between two texts, as a percentage.

    Both texts go through the same pipeline before comparison: the Whisper
    basic text normaliser, then ``cn2an.transform`` in ``"an2cn"`` mode
    (presumably Arabic numerals -> Chinese numerals; confirm against the
    cn2an docs), then the sacrebleu Chinese tokenizer.

    Args:
        hyp: First text; passed to jiwer as its first positional argument.
        ref: Second text; passed to jiwer as its second positional argument.

    Returns:
        ``jiwer.process_words(...).wer`` multiplied by 100.
    """

    def norm(x):
        basic = whisper_norm(x)
        numerals = cn2an.transform(basic, "an2cn")
        return sacretok(numerals)

    # NOTE(review): jiwer documents process_words(reference, hypothesis);
    # here `hyp` is passed first. Confirm the order is intended, since the
    # rate is normalised by the length of the first argument.
    measures = jiwer.process_words(norm(hyp), norm(ref))
    return measures.wer * 100
# Per split: sample id -> re-transcribed text.
adjusted = {split: {} for split in data}
# Per split: sample id -> error rate, for samples that need a manual look.
double_check = {split: {} for split in data}

UNK = "[UNK]"

for split in data:
    trange = tqdm(unks[split], desc=split)
    for i, sample in enumerate(trange):
        transcription = sample["transcription"]
        texts = transcription.split(UNK)

        # Unique tokens that appear after the first [UNK], in order of
        # first appearance; they are offered to the model as keywords.
        words = []
        for sent in texts[1:]:
            for w in sacretok(sent).split():
                if w not in words:
                    words += [w]

        keyword = "关键词"
        header = "字幕"
        joined = "/".join(words)
        prompt = f'{keyword} "{joined}" {header} '

        result = model.transcribe_stable(
            audio=sample["audio"]["array"],
            initial_prompt=prompt,  # encourage reuse of words
            prefix=texts[0],  # forcing start to follow real start
            language=sample["language"].replace("mixed", "zh"),
            regroup=False,
            verbose=None,
            no_speech_threshold=1.0,
            suppress_silence=False,
            word_timestamps=True,  # though unused, timestamps reduce hallucination
        ).merge_all_segments()

        # Strip any prompt scaffolding the model echoed back, then apply the
        # same cleaning as the rest of the dataset.
        adjustment = clean_transcripts(
            result.text.replace(keyword, " ").replace(header, " ")
        )
        mer = compute_mer(transcription, adjustment)
        adjusted[split][sample["id"]] = adjustment
        trange.set_postfix(mer=f"{mer:.2f}", dc=len(double_check[split]))

        # A large deviation from the original transcript is flagged for
        # manual review.
        if mer > 30:
            double_check[split][sample["id"]] = mer
            print(transcription, "||", adjustment)

        # Checkpoint every 5 samples and after the last one.
        if i % 5 == 0 or i == len(unks[split]) - 1:
            with open(f"checkpoint_{split}.json", "w", encoding="utf8") as f:
                json.dump(adjusted[split], f)
-
替换包含UNK的样本 (Python):
from datasets import DatasetDict
import json
# Load the per-split checkpoint files (sample id -> adjusted transcript).
adjusted_transcripts = {}
for split in data_raw:
    checkpoint_path = f"checkpoint_{split}.json"
    with open(checkpoint_path, "r", encoding="utf8") as f:
        adjusted_transcripts[split] = json.load(f)

# Placeholder token marking an unknown span in a transcript.
UNK = "[UNK]"
def fix_unk(sample, adjusted_dict):
    """Return the UNK-free transcript for one sample.

    The baseline is the original transcript with every ``UNK`` token removed
    and then cleaned. If ``adjusted_dict`` holds a re-transcription for the
    sample's id, that is used instead, unless its tokenized form is merely a
    substring of the tokenized baseline, in which case the baseline is kept.

    Args:
        sample: Mapping with at least the keys ``"transcription"`` and ``"id"``.
        adjusted_dict: Mapping of sample id -> adjusted transcript.

    Returns:
        A dict ``{"transcription": <final text>}``.
    """

    def bad(orig, new):
        # True when the tokenized adjustment is contained in the tokenized
        # baseline.
        return sacretok(new) in sacretok(orig)

    transcription = clean_transcripts(sample["transcription"].replace(UNK, ""))
    sid = sample["id"]
    adjustment = adjusted_dict.get(sid, transcription)
    if bad(transcription, adjustment):
        # adjustment worse than just removing UNK
        # print("skipped:", transcription, "||", adjustment)
        adjustment = transcription
    return {"transcription": adjustment}


# Rebuild every split from the raw data. `map` runs inside the comprehension,
# so the lambda sees the current `split`.
data = DatasetDict({
    split: data_raw[split].map(
        lambda x: fix_unk(x, adjusted_transcripts[split]),
        load_from_cache_file=False,
    )
    for split in data_raw
})
data = data.sort(["session_id", "id"], load_from_cache_file=False)
# Sanity check: no transcript in any split may still contain the UNK token.
for split in data:
    for line in data[split]["transcription"]:
        assert UNK not in line, f"{UNK} still present in {split}: {line!r}"
train adjusted 402 samples, 75 of which just removes UNKs.
test adjusted 36 samples, 9 of which just removes UNKs.
validation adjusted 63 samples, 7 of which just removes UNKs.



