nccratliri/vad-human-ava-speech
收藏数据集概述
数据集名称
AVA-Speech
数据集描述
AVA-Speech 数据集是为 WhisperSeg 中的人类语音活动检测定制的。音频文件提取自电影,语音的起始和结束时间点在话语(utterance)级别进行标注。
数据集下载
# Download the AVA-Speech VAD dataset from the Hugging Face Hub into a local folder.
from huggingface_hub import snapshot_download

snapshot_download(
    # The repo id must be a string literal; left unquoted, Python would try to
    # evaluate it as division/subtraction of undefined names and raise NameError.
    "nccratliri/vad-human-ava-speech",
    local_dir="data/human-ava-speech",
    repo_type="dataset",
)
引用信息
% WhisperSeg preprint (bioRxiv). The citation key is kept as exported so existing \cite commands still resolve.
% The publisher-export artifacts (competing-interest statement, direct PDF link) are kept in ignored fields
% instead of polluting `abstract` and `eprint`.
@article{Gu2023.09.30.560270,
  author              = {Gu, Nianlong and Lee, Kanghwi and Basha, Maris and Ram, Sumit Kumar and You, Guanghao and Hahnloser, Richard},
  title               = {Positive Transfer of the {Whisper} Speech Transformer to Human and Animal Voice Activity Detection},
  journal             = {bioRxiv},
  publisher           = {Cold Spring Harbor Laboratory},
  year                = {2023},
  elocation-id        = {2023.09.30.560270},
  doi                 = {10.1101/2023.09.30.560270},
  url                 = {https://www.biorxiv.org/content/early/2023/10/02/2023.09.30.560270},
  pdf                 = {https://www.biorxiv.org/content/early/2023/10/02/2023.09.30.560270.full.pdf},
  abstract            = {This paper introduces WhisperSeg, utilizing the Whisper Transformer pre-trained for Automatic Speech Recognition (ASR) for human and animal Voice Activity Detection (VAD). Contrary to traditional methods that detect human voice or animal vocalizations from a short audio frame and rely on careful threshold selection, WhisperSeg processes entire spectrograms of long audio and generates plain text representations of onset, offset, and type of voice activity. Processing a longer audio context with a larger network greatly improves detection accuracy from few labeled examples. We further demonstrate a positive transfer of detection performance to new animal species, making our approach viable in the data-scarce multi-species setting.},
  competing-interests = {The authors have declared no competing interest.},
}
% AVA-Speech dataset paper (Interspeech 2018). The citation key is kept unchanged.
% The arXiv identifier in `eprint` is taken from the original URL; the URL itself is retained.
@inproceedings{ava-speech,
  author     = {Chaudhuri, Sourish and Roth, Joseph and Ellis, Dan and Gallagher, Andrew C. and Kaver, Liat and Marvin, Radhika and Pantofaru, Caroline and Reale, Nathan Christopher and Reid, Loretta Guarino and Wilson, Kevin and Xi, Zhonghua},
  title      = {{AVA-Speech}: A Densely Labeled Dataset of Speech Activity in Movies},
  booktitle  = {Proceedings of {Interspeech} 2018},
  year       = {2018},
  eprint     = {1808.00606},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/pdf/1808.00606},
}



