nccratliri/vad-zebra-finch
收藏斑马雀语音数据集
数据集概述
该数据集是为WhisperSeg系统定制的斑马雀动物语音活动检测(语音分割)数据集。WhisperSeg利用为自动语音识别(ASR)预训练的Whisper Transformer,对人类和动物进行语音活动检测(VAD)。
数据集下载
# Download the dataset repository from the Hugging Face Hub into data/zebra-finch.
from huggingface_hub import snapshot_download

snapshot_download(
    "nccratliri/vad-zebra-finch",  # repo id must be a quoted string, not a bare name
    local_dir="data/zebra-finch",
    repo_type="dataset",
)
引用信息
在使用此数据集时,请引用以下文献:
@comment{bioRxiv preprint (2023) on nearest neighbor retrieval of zebra finch vocalizations. Its abstract announces the release of the expert-curated vocal-segment dataset.}
@article{Tomka2023.09.04.555475,
  author       = {Tomka, Tomas and Hao, Xinyu and Miao, Aoxue and Lee, Kanghwi and Basha, Maris and Reimann, Stefan and Zai, Anja T and Hahnloser, Richard},
  title        = {Benchmarking nearest neighbor retrieval of zebra finch vocalizations across development},
  journal      = {bioRxiv},
  publisher    = {Cold Spring Harbor Laboratory},
  year         = {2023},
  elocation-id = {2023.09.04.555475},
  doi          = {10.1101/2023.09.04.555475},
  url          = {https://www.biorxiv.org/content/early/2023/09/04/2023.09.04.555475},
  eprint       = {https://www.biorxiv.org/content/early/2023/09/04/2023.09.04.555475.full.pdf},
  abstract     = {Vocalizations are highly specialized motor gestures that regulate social interactions. The reliable detection of vocalizations from raw streams of microphone data remains an open problem even in research on widely studied animals such as the zebra finch. A promising method for finding vocal samples from potentially few labelled examples (templates) is nearest neighbor retrieval, but this method has never been extensively tested on vocal segmentation tasks. We retrieve zebra finch vocalizations as neighbors of each other in the sound spectrogram space. Based on merely 50 templates, we find excellent retrieval performance in adults (F1 score of 0.93 +/- 0.07) but not in juveniles (F1 score of 0.64 +/- 0.18), presumably due to the larger vocal variability of the latter. The performance in juveniles improves when retrieval is based on fixed-size template slices (F1 score of 0.72 +/- 0.10) instead of entire templates. Among the several distance metrics we tested such as the cosine and the Euclidean distance, we find that the Spearman distance largely outperforms all others. We release our expert-curated dataset of more than 50'000 zebra finch vocal segments, which will enable training of data-hungry machine-learning approaches.},
  competing-interests = {The authors have declared no competing interest.},
}
@comment{bioRxiv preprint (2023) introducing WhisperSeg, which applies the Whisper Transformer to human and animal voice activity detection.}
@article{Gu2023.09.30.560270,
  author       = {Gu, Nianlong and Lee, Kanghwi and Basha, Maris and Ram, Sumit Kumar and You, Guanghao and Hahnloser, Richard},
  title        = {Positive Transfer of the {Whisper} Speech Transformer to Human and Animal Voice Activity Detection},
  journal      = {bioRxiv},
  publisher    = {Cold Spring Harbor Laboratory},
  year         = {2023},
  elocation-id = {2023.09.30.560270},
  doi          = {10.1101/2023.09.30.560270},
  url          = {https://www.biorxiv.org/content/early/2023/10/02/2023.09.30.560270},
  eprint       = {https://www.biorxiv.org/content/early/2023/10/02/2023.09.30.560270.full.pdf},
  abstract     = {This paper introduces WhisperSeg, utilizing the Whisper Transformer pre-trained for Automatic Speech Recognition (ASR) for human and animal Voice Activity Detection (VAD). Contrary to traditional methods that detect human voice or animal vocalizations from a short audio frame and rely on careful threshold selection, WhisperSeg processes entire spectrograms of long audio and generates plain text representations of onset, offset, and type of voice activity. Processing a longer audio context with a larger network greatly improves detection accuracy from few labeled examples. We further demonstrate a positive transfer of detection performance to new animal species, making our approach viable in the data-scarce multi-species setting.},
  competing-interests = {The authors have declared no competing interest.},
}




