five

DataProvenanceInitiative/Megawika_subset

收藏
Hugging Face2024-11-19 更新2025-04-12 收录
下载链接:
https://hf-mirror.com/datasets/DataProvenanceInitiative/Megawika_subset
下载链接
链接失效反馈
官方服务:
资源简介:
Quick script to get and upload the first file from each folder in Megawika.

```python
import os
from huggingface_hub import HfApi
from huggingface_hub import hf_hub_download

filenames =[
    "af/af-00000-of-00005.jsonl",
    "ar/ar-00000-of-02834.jsonl",
    "az/az-00000-of-00111.jsonl",
    "bn/bn-00000-of-00306.jsonl",
    "cs/cs-00000-of-00318.jsonl",
    "de/de-00000-of-04503.jsonl",
    "en/en-00000-of-06154.jsonl",
    "es/es-00000-of-03727.jsonl",
    "et/et-00000-of-00251.jsonl",
    "fa/fa-00000-of-00731.jsonl",
    "fi/fi-00000-of-01139.jsonl",
    "fr/fr-00000-of-01258.jsonl",
    "ga/ga-00000-of-00008.jsonl",
    "gl/gl-00000-of-00289.jsonl",
    "gu/gu-00000-of-00035.jsonl",
    "he/he-00000-of-00180.jsonl",
    "hi/hi-00000-of-00230.jsonl",
    "hr/hr-00000-of-00186.jsonl",
    "id/id-00000-of-01062.jsonl",
    "it/it-00000-of-02379.jsonl",
    "ja/ja-00000-of-01318.jsonl",
    "ka/ka-00000-of-00101.jsonl",
    "kk/kk-00000-of-00153.jsonl",
    "km/km-00000-of-00008.jsonl",
    "ko/ko-00000-of-00289.jsonl",
    "lt/lt-00000-of-00246.jsonl",
    "lv/lv-00000-of-00072.jsonl",
    "mk/mk-00000-of-00107.jsonl",
    "ml/ml-00000-of-00188.jsonl",
    "mn/mn-00000-of-00010.jsonl",
    "mr/mr-00000-of-00043.jsonl",
    "my/my-00000-of-00235.jsonl",
    "ne/ne-00000-of-00043.jsonl",
    "nl/nl-00000-of-02136.jsonl",
    "pl/pl-00000-of-01017.jsonl",
    "ps/ps-00000-of-00006.jsonl",
    "pt/pt-00000-of-02681.jsonl",
    "ro/ro-00000-of-00545.jsonl",
    "ru/ru-00000-of-02267.jsonl",
    "si/si-00000-of-00009.jsonl",
    "sl/sl-00000-of-00077.jsonl",
    "sv/sv-00000-of-06008.jsonl",
    "ta/ta-00000-of-00325.jsonl",
    "th/th-00000-of-00287.jsonl",
    "tr/tr-00000-of-01119.jsonl",
    "uk/uk-00000-of-00970.jsonl",
    "ur/ur-00000-of-00228.jsonl",
    "vi/vi-00000-of-03660.jsonl",
    "xh/xh-00000-of-00001.jsonl",
    "zh/zh-00000-of-00525.jsonl",
]

os.makedirs("megawika_jsons", exist_ok=True)

for filename in filenames:
    hf_hub_download(
        repo_id="hltcoe/megawika",
        filename=f"data/{filename}",
        repo_type="dataset",
        local_dir="megawika_jsons"
    )

api = HfApi()

repo_id = "DataProvenanceInitiative/Megawika_subset"
token = "YOUR_TOKEN"

try:
    api.create_repo(
        repo_id=repo_id,
        repo_type="dataset",
        token=token
    )
except:
    pass

api.upload_folder(
    folder_path="./megawika_jsons/data",
    repo_id=repo_id,
    repo_type="dataset",
    token=token
)
```

本快速脚本用于获取并上传Megawika数据集各子文件夹内的首个文件。

```python
import os
from huggingface_hub import HfApi
from huggingface_hub import hf_hub_download

filenames =[
    "af/af-00000-of-00005.jsonl",
    "ar/ar-00000-of-02834.jsonl",
    "az/az-00000-of-00111.jsonl",
    "bn/bn-00000-of-00306.jsonl",
    "cs/cs-00000-of-00318.jsonl",
    "de/de-00000-of-04503.jsonl",
    "en/en-00000-of-06154.jsonl",
    "es/es-00000-of-03727.jsonl",
    "et/et-00000-of-00251.jsonl",
    "fa/fa-00000-of-00731.jsonl",
    "fi/fi-00000-of-01139.jsonl",
    "fr/fr-00000-of-01258.jsonl",
    "ga/ga-00000-of-00008.jsonl",
    "gl/gl-00000-of-00289.jsonl",
    "gu/gu-00000-of-00035.jsonl",
    "he/he-00000-of-00180.jsonl",
    "hi/hi-00000-of-00230.jsonl",
    "hr/hr-00000-of-00186.jsonl",
    "id/id-00000-of-01062.jsonl",
    "it/it-00000-of-02379.jsonl",
    "ja/ja-00000-of-01318.jsonl",
    "ka/ka-00000-of-00101.jsonl",
    "kk/kk-00000-of-00153.jsonl",
    "km/km-00000-of-00008.jsonl",
    "ko/ko-00000-of-00289.jsonl",
    "lt/lt-00000-of-00246.jsonl",
    "lv/lv-00000-of-00072.jsonl",
    "mk/mk-00000-of-00107.jsonl",
    "ml/ml-00000-of-00188.jsonl",
    "mn/mn-00000-of-00010.jsonl",
    "mr/mr-00000-of-00043.jsonl",
    "my/my-00000-of-00235.jsonl",
    "ne/ne-00000-of-00043.jsonl",
    "nl/nl-00000-of-02136.jsonl",
    "pl/pl-00000-of-01017.jsonl",
    "ps/ps-00000-of-00006.jsonl",
    "pt/pt-00000-of-02681.jsonl",
    "ro/ro-00000-of-00545.jsonl",
    "ru/ru-00000-of-02267.jsonl",
    "si/si-00000-of-00009.jsonl",
    "sl/sl-00000-of-00077.jsonl",
    "sv/sv-00000-of-06008.jsonl",
    "ta/ta-00000-of-00325.jsonl",
    "th/th-00000-of-00287.jsonl",
    "tr/tr-00000-of-01119.jsonl",
    "uk/uk-00000-of-00970.jsonl",
    "ur/ur-00000-of-00228.jsonl",
    "vi/vi-00000-of-03660.jsonl",
    "xh/xh-00000-of-00001.jsonl",
    "zh/zh-00000-of-00525.jsonl",
]

os.makedirs("megawika_jsons", exist_ok=True)

for filename in filenames:
    hf_hub_download(
        repo_id="hltcoe/megawika",
        filename=f"data/{filename}",
        repo_type="dataset",
        local_dir="megawika_jsons"
    )

api = HfApi()

repo_id = "DataProvenanceInitiative/Megawika_subset"
token = "YOUR_TOKEN"

try:
    api.create_repo(
        repo_id=repo_id,
        repo_type="dataset",
        token=token
    )
except:
    pass

api.upload_folder(
    folder_path="./megawika_jsons/data",
    repo_id=repo_id,
    repo_type="dataset",
    token=token
)
```
提供机构:
DataProvenanceInitiative
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作