five

whooray/MIRACLRetrieval

收藏
Hugging Face2026-03-02 更新2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/whooray/MIRACLRetrieval
下载链接
链接失效反馈
官方服务:
资源简介:
--- annotations_creators: - expert-annotated language: - ara - ben - deu - eng - fas - fin - fra - hin - ind - jpn - kor - rus - spa - swa - tel - tha - yor - zho license: cc-by-sa-4.0 multilinguality: multilingual source_datasets: - RSamoed/MIRACLRetrieval task_categories: - text-retrieval task_ids: [] dataset_info: - config_name: ar-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 1217782171 num_examples: 2061414 download_size: 547359941 dataset_size: 1217782171 - config_name: ar-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 835497 num_examples: 29197 download_size: 297571 dataset_size: 835497 - config_name: ar-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 192068 num_examples: 2896 download_size: 107179 dataset_size: 192068 - config_name: bn-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 309278144 num_examples: 297265 download_size: 111041231 dataset_size: 309278144 - config_name: bn-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 113824 num_examples: 4206 download_size: 39642 dataset_size: 113824 - config_name: bn-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 55939 num_examples: 411 download_size: 24670 dataset_size: 55939 - config_name: de-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 7018142949 num_examples: 15866222 download_size: 4154965117 dataset_size: 7018142949 - config_name: de-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 105647 
num_examples: 3144 download_size: 36113 dataset_size: 105647 - config_name: de-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 19450 num_examples: 305 download_size: 14370 dataset_size: 19450 - config_name: en-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 13936141152 num_examples: 32893221 download_size: 7975250425 dataset_size: 13936141152 - config_name: en-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 242967 num_examples: 8350 download_size: 90601 dataset_size: 242967 - config_name: en-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 41576 num_examples: 799 download_size: 28786 dataset_size: 41576 - config_name: es-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 4490445463 num_examples: 10373953 download_size: 2570067226 dataset_size: 4490445463 - config_name: es-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 215688 num_examples: 6443 download_size: 71449 dataset_size: 215688 - config_name: es-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 43504 num_examples: 648 download_size: 29334 dataset_size: 43504 - config_name: fa-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 1091928603 num_examples: 2207172 download_size: 474255337 dataset_size: 1091928603 - config_name: fa-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 216792 num_examples: 6571 download_size: 71058 dataset_size: 216792 - config_name: 
fa-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 58114 num_examples: 632 download_size: 32519 dataset_size: 58114 - config_name: fi-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 739208952 num_examples: 1883509 download_size: 438759882 dataset_size: 739208952 - config_name: fi-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 334260 num_examples: 12008 download_size: 114931 dataset_size: 334260 - config_name: fi-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 65738 num_examples: 1271 download_size: 46897 dataset_size: 65738 - config_name: fr-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 5507181331 num_examples: 14636953 download_size: 3090791707 dataset_size: 5507181331 - config_name: fr-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 115836 num_examples: 3429 download_size: 38727 dataset_size: 115836 - config_name: fr-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 21433 num_examples: 343 download_size: 14680 dataset_size: 21433 - config_name: hi-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 490803869 num_examples: 506264 download_size: 175786968 dataset_size: 490803869 - config_name: hi-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 111217 num_examples: 3494 download_size: 36608 dataset_size: 111217 - config_name: hi-queries features: - name: _id dtype: string - name: text dtype: string splits: - 
name: dev num_bytes: 54122 num_examples: 350 download_size: 25503 dataset_size: 54122 - config_name: id-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 537334548 num_examples: 1446315 download_size: 271115071 dataset_size: 537334548 - config_name: id-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 263958 num_examples: 9668 download_size: 90280 dataset_size: 263958 - config_name: id-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 47803 num_examples: 960 download_size: 30473 dataset_size: 47803 - config_name: ja-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 2970531665 num_examples: 6953614 download_size: 1667392675 dataset_size: 2970531665 - config_name: ja-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 236068 num_examples: 8354 download_size: 83800 dataset_size: 236068 - config_name: ja-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 55179 num_examples: 860 download_size: 33901 dataset_size: 55179 - config_name: ko-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 633206834 num_examples: 1486752 download_size: 361325767 dataset_size: 633206834 - config_name: ko-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 83188 num_examples: 3057 download_size: 30628 dataset_size: 83188 - config_name: ko-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 13875 num_examples: 213 download_size: 9844 dataset_size: 13875 - 
config_name: ru-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 5921641619 num_examples: 9543918 download_size: 2767862757 dataset_size: 5921641619 - config_name: ru-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 373821 num_examples: 13100 download_size: 134522 dataset_size: 373821 - config_name: ru-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 116037 num_examples: 1252 download_size: 67086 dataset_size: 116037 - config_name: sw-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 32766816 num_examples: 131924 download_size: 16589955 dataset_size: 32766816 - config_name: sw-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 135409 num_examples: 5092 download_size: 44041 dataset_size: 135409 - config_name: sw-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 24357 num_examples: 482 download_size: 15546 dataset_size: 24357 - config_name: te-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 549992531 num_examples: 518079 download_size: 149672222 dataset_size: 549992531 - config_name: te-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 43268 num_examples: 1606 download_size: 19278 dataset_size: 43268 - config_name: te-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 95102 num_examples: 828 download_size: 38384 dataset_size: 95102 - config_name: th-corpus features: - name: _id dtype: string - name: text dtype: string - 
name: title dtype: string splits: - name: dev num_bytes: 549881577 num_examples: 542166 download_size: 201796559 dataset_size: 549881577 - config_name: th-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 208002 num_examples: 7573 download_size: 71074 dataset_size: 208002 - config_name: th-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 98697 num_examples: 733 download_size: 44774 dataset_size: 98697 - config_name: yo-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 10639617 num_examples: 49043 download_size: 5157335 dataset_size: 10639617 - config_name: yo-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 35512 num_examples: 1188 download_size: 10613 dataset_size: 35512 - config_name: yo-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 6697 num_examples: 119 download_size: 5615 dataset_size: 6697 - config_name: zh-corpus features: - name: _id dtype: string - name: text dtype: string - name: title dtype: string splits: - name: dev num_bytes: 1718428423 num_examples: 4934368 download_size: 1092221644 dataset_size: 1718428423 - config_name: zh-qrels features: - name: query-id dtype: string - name: corpus-id dtype: string - name: score dtype: int64 splits: - name: dev num_bytes: 129492 num_examples: 3928 download_size: 42650 dataset_size: 129492 - config_name: zh-queries features: - name: _id dtype: string - name: text dtype: string splits: - name: dev num_bytes: 19261 num_examples: 393 download_size: 13803 dataset_size: 19261 configs: - config_name: ar-corpus data_files: - split: dev path: ar-corpus/dev-* - config_name: ar-qrels data_files: - split: dev path: ar-qrels/dev-* - config_name: ar-queries data_files: - 
split: dev path: ar-queries/dev-* - config_name: bn-corpus data_files: - split: dev path: bn-corpus/dev-* - config_name: bn-qrels data_files: - split: dev path: bn-qrels/dev-* - config_name: bn-queries data_files: - split: dev path: bn-queries/dev-* - config_name: de-corpus data_files: - split: dev path: de-corpus/dev-* - config_name: de-qrels data_files: - split: dev path: de-qrels/dev-* - config_name: de-queries data_files: - split: dev path: de-queries/dev-* - config_name: en-corpus data_files: - split: dev path: en-corpus/dev-* - config_name: en-qrels data_files: - split: dev path: en-qrels/dev-* - config_name: en-queries data_files: - split: dev path: en-queries/dev-* - config_name: es-corpus data_files: - split: dev path: es-corpus/dev-* - config_name: es-qrels data_files: - split: dev path: es-qrels/dev-* - config_name: es-queries data_files: - split: dev path: es-queries/dev-* - config_name: fa-corpus data_files: - split: dev path: fa-corpus/dev-* - config_name: fa-qrels data_files: - split: dev path: fa-qrels/dev-* - config_name: fa-queries data_files: - split: dev path: fa-queries/dev-* - config_name: fi-corpus data_files: - split: dev path: fi-corpus/dev-* - config_name: fi-qrels data_files: - split: dev path: fi-qrels/dev-* - config_name: fi-queries data_files: - split: dev path: fi-queries/dev-* - config_name: fr-corpus data_files: - split: dev path: fr-corpus/dev-* - config_name: fr-qrels data_files: - split: dev path: fr-qrels/dev-* - config_name: fr-queries data_files: - split: dev path: fr-queries/dev-* - config_name: hi-corpus data_files: - split: dev path: hi-corpus/dev-* - config_name: hi-qrels data_files: - split: dev path: hi-qrels/dev-* - config_name: hi-queries data_files: - split: dev path: hi-queries/dev-* - config_name: id-corpus data_files: - split: dev path: id-corpus/dev-* - config_name: id-qrels data_files: - split: dev path: id-qrels/dev-* - config_name: id-queries data_files: - split: dev path: id-queries/dev-* - config_name: 
ja-corpus data_files: - split: dev path: ja-corpus/dev-* - config_name: ja-qrels data_files: - split: dev path: ja-qrels/dev-* - config_name: ja-queries data_files: - split: dev path: ja-queries/dev-* - config_name: ko-corpus data_files: - split: dev path: ko-corpus/dev-* - config_name: ko-qrels data_files: - split: dev path: ko-qrels/dev-* - config_name: ko-queries data_files: - split: dev path: ko-queries/dev-* - config_name: ru-corpus data_files: - split: dev path: ru-corpus/dev-* - config_name: ru-qrels data_files: - split: dev path: ru-qrels/dev-* - config_name: ru-queries data_files: - split: dev path: ru-queries/dev-* - config_name: sw-corpus data_files: - split: dev path: sw-corpus/dev-* - config_name: sw-qrels data_files: - split: dev path: sw-qrels/dev-* - config_name: sw-queries data_files: - split: dev path: sw-queries/dev-* - config_name: te-corpus data_files: - split: dev path: te-corpus/dev-* - config_name: te-qrels data_files: - split: dev path: te-qrels/dev-* - config_name: te-queries data_files: - split: dev path: te-queries/dev-* - config_name: th-corpus data_files: - split: dev path: th-corpus/dev-* - config_name: th-qrels data_files: - split: dev path: th-qrels/dev-* - config_name: th-queries data_files: - split: dev path: th-queries/dev-* - config_name: yo-corpus data_files: - split: dev path: yo-corpus/dev-* - config_name: yo-qrels data_files: - split: dev path: yo-qrels/dev-* - config_name: yo-queries data_files: - split: dev path: yo-queries/dev-* - config_name: zh-corpus data_files: - split: dev path: zh-corpus/dev-* - config_name: zh-qrels data_files: - split: dev path: zh-qrels/dev-* - config_name: zh-queries data_files: - split: dev path: zh-queries/dev-* tags: - mteb - text --- <!-- adapted from https://github.com/huggingface/huggingface_hub/blob/v0.30.2/src/huggingface_hub/templates/datasetcard_template.md --> <div align="center" style="padding: 40px 20px; background-color: white; border-radius: 12px; box-shadow: 0 2px 10px rgba(0, 0, 
0, 0.05); max-width: 600px; margin: 0 auto;"> <h1 style="font-size: 3.5rem; color: #1a1a1a; margin: 0 0 20px 0; letter-spacing: 2px; font-weight: 700;">MIRACLRetrieval</h1> <div style="font-size: 1.5rem; color: #4a4a4a; margin-bottom: 5px; font-weight: 300;">An <a href="https://github.com/embeddings-benchmark/mteb" style="color: #2c5282; font-weight: 600; text-decoration: none;" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">MTEB</a> dataset</div> <div style="font-size: 0.9rem; color: #2c5282; margin-top: 10px;">Massive Text Embedding Benchmark</div> </div> MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual retrieval dataset that focuses on search across 18 different languages. | | | |---------------|---------------------------------------------| | Task category | t2t | | Domains | Encyclopaedic, Written | | Reference | http://miracl.ai/ | ## How to evaluate on this task You can evaluate an embedding model on this dataset using the following code: ```python import mteb task = mteb.get_tasks(["MIRACLRetrieval"]) evaluator = mteb.MTEB(task) model = mteb.get_model(YOUR_MODEL) evaluator.run(model) ``` <!-- Datasets want link to arxiv in readme to autolink dataset with paper --> To learn more about how to run models on `mteb` tasks, check out the [GitHub repository](https://github.com/embeddings-benchmark/mteb). ## Citation If you use this dataset, please cite the dataset as well as [mteb](https://github.com/embeddings-benchmark/mteb), as this dataset likely includes additional processing as a part of the [MMTEB Contribution](https://github.com/embeddings-benchmark/mteb/tree/main/docs/mmteb). ```bibtex @article{10.1162/tacl_a_00595, abstract = {{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. 
This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. MIRACL is available at http://miracl.ai/.}}, author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy}, doi = {10.1162/tacl_a_00595}, eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf}, issn = {2307-387X}, journal = {Transactions of the Association for Computational Linguistics}, month = {09}, pages = {1114-1131}, title = {{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}}, url = {https://doi.org/10.1162/tacl\_a\_00595}, volume = {11}, year = {2023}, } @article{enevoldsen2025mmtebmassivemultilingualtext, title={MMTEB: Massive Multilingual Text Embedding Benchmark}, author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu 
Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff}, publisher = {arXiv}, journal={arXiv preprint arXiv:2502.13595}, year={2025}, url={https://arxiv.org/abs/2502.13595}, doi = {10.48550/arXiv.2502.13595}, } @article{muennighoff2022mteb, author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils}, title = {MTEB: Massive Text Embedding Benchmark}, publisher = {arXiv}, journal={arXiv preprint arXiv:2210.07316}, year = {2022}, url = {https://arxiv.org/abs/2210.07316}, doi = {10.48550/ARXIV.2210.07316}, } ``` # Dataset Statistics <details> <summary> Dataset Statistics</summary> The following code contains the descriptive statistics from the task. 
These can also be obtained using: ```python import mteb task = mteb.get_task("MIRACLRetrieval") desc_stats = task.metadata.descriptive_stats ``` ```json { "dev": { "num_samples": 106345647, "number_of_characters": 37176781172, "num_documents": 106332152, "min_document_length": 2, "average_document_length": 349.6241542163089, "max_document_length": 84930, "unique_documents": 106332152, "num_queries": 13495, "min_query_length": 5, "average_query_length": 36.49225639125602, "max_query_length": 176, "unique_queries": 13495, "none_queries": 0, "num_relevant_docs": 130408, "min_relevant_docs_per_query": 1, "average_relevant_docs_per_query": 2.3059651722860317, "max_relevant_docs_per_query": 20, "unique_relevant_docs": 119924, "num_instructions": null, "min_instruction_length": null, "average_instruction_length": null, "max_instruction_length": null, "unique_instructions": null, "num_top_ranked": null, "min_top_ranked_per_query": null, "average_top_ranked_per_query": null, "max_top_ranked_per_query": null } } ``` </details> --- *This dataset card was automatically generated using [MTEB](https://github.com/embeddings-benchmark/mteb)*
提供机构:
whooray
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作