SEACrowd/cc100
收藏数据集概述
语言
- ind
- jav
- sun
- mya
- mya_zaw
- lao
- khm
- tgl
- vie
- tha
- zlm
任务类别
- 自监督预训练
标签
- 自监督预训练
数据集描述
该语料库旨在重现用于训练XLM-R的数据集。该语料库包含100多种语言的单语数据,还包括罗马化语言的数据(以*_rom表示)。该语料库是通过处理2018年1月至12月的Commoncrawl快照,使用CC-NET仓库提供的URL和段落索引构建的。每个文件包含由双换行符分隔的文档和由单换行符分隔的同一文档内的段落。数据生成使用了开源的CC-NET仓库。
数据集版本
- 源版本: 2018.12.01
- SEACrowd版本: 2024.06.20
数据集许可证
- MIT
引用
@inproceedings{conneau-etal-2020-unsupervised, title = "Unsupervised Cross-lingual Representation Learning at Scale", author = "Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin", booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.acl-main.747", doi = "10.18653/v1/2020.acl-main.747", pages = "8440--8451", abstract = "This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6{%} average accuracy on XNLI, +13{%} average F1 score on MLQA, and +2.4{%} F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7{%} in XNLI accuracy for Swahili and 11.4{%} for Urdu over previous XLM models. We also present a detailed empirical analysis of the key factors that are required to achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make our code and models publicly available.", }
@inproceedings{wenzek-etal-2020-ccnet, title = "{CCN}et: Extracting High Quality Monolingual Datasets from Web Crawl Data", author = "Wenzek, Guillaume and Lachaux, Marie-Anne and Conneau, Alexis and Chaudhary, Vishrav and Guzm{a}n, Francisco and Joulin, Armand and Grave, Edouard", booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference", month = may, year = "2020", address = "Marseille, France", publisher = "European Language Resources Association", url = "https://www.aclweb.org/anthology/2020.lrec-1.494", pages = "4003--4012", abstract = "Pre-training text representations have led to significant improvements in many areas of natural language processing. The quality of these models benefits greatly from the size of the pretraining corpora as long as its quality is preserved. In this paper, we describe an automatic pipeline to extract massive high-quality monolingual datasets from Common Crawl for a variety of languages. Our pipeline follows the data processing introduced in fastText (Mikolov et al., 2017; Grave et al., 2018), that deduplicates documents and identifies their language. We augment this pipeline with a filtering step to select documents that are close to high quality corpora like Wikipedia.", language = "English", ISBN = "979-10-95546-34-4", }
@article{lovenia2024seacrowd, title={SEACrowd: A Multilingual Multimodal Data Hub and Benchmark Suite for Southeast Asian Languages}, author={Holy Lovenia and Rahmad Mahendra and Salsabil Maulana Akbar and Lester James V. Miranda and Jennifer Santoso and Elyanah Aco and Akhdan Fadhilah and Jonibek Mansurov and Joseph Marvin Imperial and Onno P. Kampman and Joel Ruben Antony Moniz and Muhammad Ravi Shulthan Habibi and Frederikus Hudi and Railey Montalan and Ryan Ignatius and Joanito Agili Lopo and William Nixon and Börje F. Karlsson and James Jaya and Ryandito Diandaru and Yuze Gao and Patrick Amadeus and Bin Wang and Jan Christian Blaise Cruz and Chenxi Whitehouse and Ivan Halim Parmonangan and Maria Khelli and Wenyu Zhang and Lucky Susanto and Reynard Adha Ryanda and Sonny Lazuardi Hermawan and Dan John Velasco and Muhammad Dehan Al Kautsar and Willy Fitra Hendria and Yasmin Moslem and Noah Flynn and Muhammad Farid Adilazuarda and Haochen Li and Johanes Lee and R. Damanhuri and Shuo Sun and Muhammad Reza Qorib and Amirbek Djanibekov and Wei Qi Leong and Quyet V. Do and Niklas Muennighoff and Tanrada Pansuwan and Ilham Firdausi Putra and Yan Xu and Ngee Chia Tai and Ayu Purwarianti and Sebastian Ruder and William Tjhi and Peerat Limkonchotiwat and Alham Fikri Aji and Sedrick Keh and Genta Indra Winata and Ruochen Zhang and Fajri Koto and Zheng-Xin Yong and Samuel Cahyawijaya}, year={2024}, eprint={2406.10118}, journal={arXiv preprint arXiv: 2406.10118} }




