cis-lmu/Glot500
收藏Hugging Face2025-12-10 更新2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/cis-lmu/Glot500
下载链接
链接失效反馈官方服务:
资源简介:
---
license: other
license_name: license
license_link: LICENSE
configs:
- config_name: knv_Latn
data_files:
- split: train
path: "knv_Latn/train/*.arrow"
- config_name: tgk_Latn
data_files:
- split: train
path: "tgk_Latn/train/*.arrow"
- config_name: ton_Latn
data_files:
- split: train
path: "ton_Latn/train/*.arrow"
- config_name: nld_Latn
data_files:
- split: train
path: "nld_Latn/train/*.arrow"
- config_name: tzo_Latn
data_files:
- split: train
path: "tzo_Latn/train/*.arrow"
- config_name: cuk_Latn
data_files:
- split: train
path: "cuk_Latn/train/*.arrow"
- config_name: fil_Latn
data_files:
- split: train
path: "fil_Latn/train/*.arrow"
- config_name: hau_Arab
data_files:
- split: train
path: "hau_Arab/train/*.arrow"
- config_name: uzb_Cyrl
data_files:
- split: train
path: "uzb_Cyrl/train/*.arrow"
- config_name: jav_Latn
data_files:
- split: train
path: "jav_Latn/train/*.arrow"
- config_name: rap_Latn
data_files:
- split: train
path: "rap_Latn/train/*.arrow"
- config_name: bak_Cyrl
data_files:
- split: train
path: "bak_Cyrl/train/*.arrow"
- config_name: por_Latn
data_files:
- split: train
path: "por_Latn/train/*.arrow"
- config_name: hbo_Hebr
data_files:
- split: train
path: "hbo_Hebr/train/*.arrow"
- config_name: quy_Latn
data_files:
- split: train
path: "quy_Latn/train/*.arrow"
- config_name: hnj_Latn
data_files:
- split: train
path: "hnj_Latn/train/*.arrow"
- config_name: ast_Latn
data_files:
- split: train
path: "ast_Latn/train/*.arrow"
- config_name: cos_Latn
data_files:
- split: train
path: "cos_Latn/train/*.arrow"
- config_name: fon_Latn
data_files:
- split: train
path: "fon_Latn/train/*.arrow"
- config_name: sna_Latn
data_files:
- split: train
path: "sna_Latn/train/*.arrow"
- config_name: dzo_Tibt
data_files:
- split: train
path: "dzo_Tibt/train/*.arrow"
- config_name: nob_Latn
data_files:
- split: train
path: "nob_Latn/train/*.arrow"
- config_name: nch_Latn
data_files:
- split: train
path: "nch_Latn/train/*.arrow"
- config_name: che_Cyrl
data_files:
- split: train
path: "che_Cyrl/train/*.arrow"
- config_name: ext_Latn
data_files:
- split: train
path: "ext_Latn/train/*.arrow"
- config_name: dtp_Latn
data_files:
- split: train
path: "dtp_Latn/train/*.arrow"
- config_name: yue_Hani
data_files:
- split: train
path: "yue_Hani/train/*.arrow"
- config_name: kbd_Cyrl
data_files:
- split: train
path: "kbd_Cyrl/train/*.arrow"
- config_name: mar_Deva
data_files:
- split: train
path: "mar_Deva/train/*.arrow"
- config_name: ron_Latn
data_files:
- split: train
path: "ron_Latn/train/*.arrow"
- config_name: acr_Latn
data_files:
- split: train
path: "acr_Latn/train/*.arrow"
- config_name: afb_Arab
data_files:
- split: train
path: "afb_Arab/train/*.arrow"
- config_name: sqi_Latn
data_files:
- split: train
path: "sqi_Latn/train/*.arrow"
- config_name: eng_Latn
data_files:
- split: train
path: "eng_Latn/train/*.arrow"
- config_name: ksd_Latn
data_files:
- split: train
path: "ksd_Latn/train/*.arrow"
- config_name: bcl_Latn
data_files:
- split: train
path: "bcl_Latn/train/*.arrow"
- config_name: ksh_Latn
data_files:
- split: train
path: "ksh_Latn/train/*.arrow"
- config_name: hin_Latn
data_files:
- split: train
path: "hin_Latn/train/*.arrow"
- config_name: myv_Cyrl
data_files:
- split: train
path: "myv_Cyrl/train/*.arrow"
- config_name: kjh_Cyrl
data_files:
- split: train
path: "kjh_Cyrl/train/*.arrow"
- config_name: sah_Cyrl
data_files:
- split: train
path: "sah_Cyrl/train/*.arrow"
- config_name: naq_Latn
data_files:
- split: train
path: "naq_Latn/train/*.arrow"
- config_name: tdt_Latn
data_files:
- split: train
path: "tdt_Latn/train/*.arrow"
- config_name: kac_Latn
data_files:
- split: train
path: "kac_Latn/train/*.arrow"
- config_name: cak_Latn
data_files:
- split: train
path: "cak_Latn/train/*.arrow"
- config_name: kir_Cyrl
data_files:
- split: train
path: "kir_Cyrl/train/*.arrow"
- config_name: mps_Latn
data_files:
- split: train
path: "mps_Latn/train/*.arrow"
- config_name: yid_Hebr
data_files:
- split: train
path: "yid_Hebr/train/*.arrow"
- config_name: srn_Latn
data_files:
- split: train
path: "srn_Latn/train/*.arrow"
- config_name: div_Thaa
data_files:
- split: train
path: "div_Thaa/train/*.arrow"
- config_name: mkd_Cyrl
data_files:
- split: train
path: "mkd_Cyrl/train/*.arrow"
- config_name: bre_Latn
data_files:
- split: train
path: "bre_Latn/train/*.arrow"
- config_name: tvl_Latn
data_files:
- split: train
path: "tvl_Latn/train/*.arrow"
- config_name: ven_Latn
data_files:
- split: train
path: "ven_Latn/train/*.arrow"
- config_name: wuu_Hani
data_files:
- split: train
path: "wuu_Hani/train/*.arrow"
- config_name: mwl_Latn
data_files:
- split: train
path: "mwl_Latn/train/*.arrow"
- config_name: miq_Latn
data_files:
- split: train
path: "miq_Latn/train/*.arrow"
- config_name: slv_Latn
data_files:
- split: train
path: "slv_Latn/train/*.arrow"
- config_name: hrv_Latn
data_files:
- split: train
path: "hrv_Latn/train/*.arrow"
- config_name: hmo_Latn
data_files:
- split: train
path: "hmo_Latn/train/*.arrow"
- config_name: som_Latn
data_files:
- split: train
path: "som_Latn/train/*.arrow"
- config_name: bod_Tibt
data_files:
- split: train
path: "bod_Tibt/train/*.arrow"
- config_name: pls_Latn
data_files:
- split: train
path: "pls_Latn/train/*.arrow"
- config_name: ile_Latn
data_files:
- split: train
path: "ile_Latn/train/*.arrow"
- config_name: luo_Latn
data_files:
- split: train
path: "luo_Latn/train/*.arrow"
- config_name: pus_Arab
data_files:
- split: train
path: "pus_Arab/train/*.arrow"
- config_name: fao_Latn
data_files:
- split: train
path: "fao_Latn/train/*.arrow"
- config_name: ces_Latn
data_files:
- split: train
path: "ces_Latn/train/*.arrow"
- config_name: fas_Arab
data_files:
- split: train
path: "fas_Arab/train/*.arrow"
- config_name: swa_Latn
data_files:
- split: train
path: "swa_Latn/train/*.arrow"
- config_name: ary_Arab
data_files:
- split: train
path: "ary_Arab/train/*.arrow"
- config_name: tbz_Latn
data_files:
- split: train
path: "tbz_Latn/train/*.arrow"
- config_name: hus_Latn
data_files:
- split: train
path: "hus_Latn/train/*.arrow"
- config_name: ote_Latn
data_files:
- split: train
path: "ote_Latn/train/*.arrow"
- config_name: ilo_Latn
data_files:
- split: train
path: "ilo_Latn/train/*.arrow"
- config_name: abk_Cyrl
data_files:
- split: train
path: "abk_Cyrl/train/*.arrow"
- config_name: bqc_Latn
data_files:
- split: train
path: "bqc_Latn/train/*.arrow"
- config_name: hil_Latn
data_files:
- split: train
path: "hil_Latn/train/*.arrow"
- config_name: pon_Latn
data_files:
- split: train
path: "pon_Latn/train/*.arrow"
- config_name: zul_Latn
data_files:
- split: train
path: "zul_Latn/train/*.arrow"
- config_name: als_Latn
data_files:
- split: train
path: "als_Latn/train/*.arrow"
- config_name: pes_Arab
data_files:
- split: train
path: "pes_Arab/train/*.arrow"
- config_name: bpy_Beng
data_files:
- split: train
path: "bpy_Beng/train/*.arrow"
- config_name: bos_Latn
data_files:
- split: train
path: "bos_Latn/train/*.arrow"
- config_name: sot_Latn
data_files:
- split: train
path: "sot_Latn/train/*.arrow"
- config_name: lin_Latn
data_files:
- split: train
path: "lin_Latn/train/*.arrow"
- config_name: tuk_Cyrl
data_files:
- split: train
path: "tuk_Cyrl/train/*.arrow"
- config_name: gla_Latn
data_files:
- split: train
path: "gla_Latn/train/*.arrow"
- config_name: wln_Latn
data_files:
- split: train
path: "wln_Latn/train/*.arrow"
- config_name: apc_Arab
data_files:
- split: train
path: "apc_Arab/train/*.arrow"
- config_name: hin_Deva
data_files:
- split: train
path: "hin_Deva/train/*.arrow"
- config_name: hye_Armn
data_files:
- split: train
path: "hye_Armn/train/*.arrow"
- config_name: tir_Ethi
data_files:
- split: train
path: "tir_Ethi/train/*.arrow"
- config_name: pap_Latn
data_files:
- split: train
path: "pap_Latn/train/*.arrow"
- config_name: gcf_Latn
data_files:
- split: train
path: "gcf_Latn/train/*.arrow"
- config_name: cjk_Latn
data_files:
- split: train
path: "cjk_Latn/train/*.arrow"
- config_name: pcd_Latn
data_files:
- split: train
path: "pcd_Latn/train/*.arrow"
- config_name: tur_Latn
data_files:
- split: train
path: "tur_Latn/train/*.arrow"
- config_name: kon_Latn
data_files:
- split: train
path: "kon_Latn/train/*.arrow"
- config_name: csy_Latn
data_files:
- split: train
path: "csy_Latn/train/*.arrow"
- config_name: bul_Cyrl
data_files:
- split: train
path: "bul_Cyrl/train/*.arrow"
- config_name: xho_Latn
data_files:
- split: train
path: "xho_Latn/train/*.arrow"
- config_name: guc_Latn
data_files:
- split: train
path: "guc_Latn/train/*.arrow"
- config_name: aka_Latn
data_files:
- split: train
path: "aka_Latn/train/*.arrow"
- config_name: kea_Latn
data_files:
- split: train
path: "kea_Latn/train/*.arrow"
- config_name: bar_Latn
data_files:
- split: train
path: "bar_Latn/train/*.arrow"
- config_name: sme_Latn
data_files:
- split: train
path: "sme_Latn/train/*.arrow"
- config_name: csb_Latn
data_files:
- split: train
path: "csb_Latn/train/*.arrow"
- config_name: bak_Latn
data_files:
- split: train
path: "bak_Latn/train/*.arrow"
- config_name: djk_Latn
data_files:
- split: train
path: "djk_Latn/train/*.arrow"
- config_name: xav_Latn
data_files:
- split: train
path: "xav_Latn/train/*.arrow"
- config_name: oci_Latn
data_files:
- split: train
path: "oci_Latn/train/*.arrow"
- config_name: acm_Arab
data_files:
- split: train
path: "acm_Arab/train/*.arrow"
- config_name: rmy_Cyrl
data_files:
- split: train
path: "rmy_Cyrl/train/*.arrow"
- config_name: krc_Cyrl
data_files:
- split: train
path: "krc_Cyrl/train/*.arrow"
- config_name: cym_Latn
data_files:
- split: train
path: "cym_Latn/train/*.arrow"
- config_name: lus_Latn
data_files:
- split: train
path: "lus_Latn/train/*.arrow"
- config_name: ngu_Latn
data_files:
- split: train
path: "ngu_Latn/train/*.arrow"
- config_name: yom_Latn
data_files:
- split: train
path: "yom_Latn/train/*.arrow"
- config_name: tam_Taml
data_files:
- split: train
path: "tam_Taml/train/*.arrow"
- config_name: ajp_Arab
data_files:
- split: train
path: "ajp_Arab/train/*.arrow"
- config_name: epo_Latn
data_files:
- split: train
path: "epo_Latn/train/*.arrow"
- config_name: fra_Latn
data_files:
- split: train
path: "fra_Latn/train/*.arrow"
- config_name: ita_Latn
data_files:
- split: train
path: "ita_Latn/train/*.arrow"
- config_name: seh_Latn
data_files:
- split: train
path: "seh_Latn/train/*.arrow"
- config_name: hbs_Latn
data_files:
- split: train
path: "hbs_Latn/train/*.arrow"
- config_name: uzn_Cyrl
data_files:
- split: train
path: "uzn_Cyrl/train/*.arrow"
- config_name: ksw_Mymr
data_files:
- split: train
path: "ksw_Mymr/train/*.arrow"
- config_name: pms_Latn
data_files:
- split: train
path: "pms_Latn/train/*.arrow"
- config_name: zlm_Latn
data_files:
- split: train
path: "zlm_Latn/train/*.arrow"
- config_name: qub_Latn
data_files:
- split: train
path: "qub_Latn/train/*.arrow"
- config_name: arg_Latn
data_files:
- split: train
path: "arg_Latn/train/*.arrow"
- config_name: enm_Latn
data_files:
- split: train
path: "enm_Latn/train/*.arrow"
- config_name: kaa_Cyrl
data_files:
- split: train
path: "kaa_Cyrl/train/*.arrow"
- config_name: toj_Latn
data_files:
- split: train
path: "toj_Latn/train/*.arrow"
- config_name: spa_Latn
data_files:
- split: train
path: "spa_Latn/train/*.arrow"
- config_name: pol_Latn
data_files:
- split: train
path: "pol_Latn/train/*.arrow"
- config_name: kos_Latn
data_files:
- split: train
path: "kos_Latn/train/*.arrow"
- config_name: kab_Latn
data_files:
- split: train
path: "kab_Latn/train/*.arrow"
- config_name: pan_Guru
data_files:
- split: train
path: "pan_Guru/train/*.arrow"
- config_name: nan_Latn
data_files:
- split: train
path: "nan_Latn/train/*.arrow"
- config_name: aze_Latn
data_files:
- split: train
path: "aze_Latn/train/*.arrow"
- config_name: ara_Arab
data_files:
- split: train
path: "ara_Arab/train/*.arrow"
- config_name: meu_Latn
data_files:
- split: train
path: "meu_Latn/train/*.arrow"
- config_name: som_Arab
data_files:
- split: train
path: "som_Arab/train/*.arrow"
- config_name: lvs_Latn
data_files:
- split: train
path: "lvs_Latn/train/*.arrow"
- config_name: nbl_Latn
data_files:
- split: train
path: "nbl_Latn/train/*.arrow"
- config_name: crh_Latn
data_files:
- split: train
path: "crh_Latn/train/*.arrow"
- config_name: kbp_Latn
data_files:
- split: train
path: "kbp_Latn/train/*.arrow"
- config_name: tgl_Latn
data_files:
- split: train
path: "tgl_Latn/train/*.arrow"
- config_name: kmb_Latn
data_files:
- split: train
path: "kmb_Latn/train/*.arrow"
- config_name: hun_Latn
data_files:
- split: train
path: "hun_Latn/train/*.arrow"
- config_name: yao_Latn
data_files:
- split: train
path: "yao_Latn/train/*.arrow"
- config_name: arn_Latn
data_files:
- split: train
path: "arn_Latn/train/*.arrow"
- config_name: jbo_Latn
data_files:
- split: train
path: "jbo_Latn/train/*.arrow"
- config_name: mzn_Arab
data_files:
- split: train
path: "mzn_Arab/train/*.arrow"
- config_name: lzh_Hani
data_files:
- split: train
path: "lzh_Hani/train/*.arrow"
- config_name: heb_Hebr
data_files:
- split: train
path: "heb_Hebr/train/*.arrow"
- config_name: bjn_Latn
data_files:
- split: train
path: "bjn_Latn/train/*.arrow"
- config_name: gug_Latn
data_files:
- split: train
path: "gug_Latn/train/*.arrow"
- config_name: swc_Latn
data_files:
- split: train
path: "swc_Latn/train/*.arrow"
- config_name: yor_Latn
data_files:
- split: train
path: "yor_Latn/train/*.arrow"
- config_name: ban_Latn
data_files:
- split: train
path: "ban_Latn/train/*.arrow"
- config_name: tlh_Latn
data_files:
- split: train
path: "tlh_Latn/train/*.arrow"
- config_name: chv_Cyrl
data_files:
- split: train
path: "chv_Cyrl/train/*.arrow"
- config_name: sin_Sinh
data_files:
- split: train
path: "sin_Sinh/train/*.arrow"
- config_name: ind_Latn
data_files:
- split: train
path: "ind_Latn/train/*.arrow"
- config_name: amh_Ethi
data_files:
- split: train
path: "amh_Ethi/train/*.arrow"
- config_name: zea_Latn
data_files:
- split: train
path: "zea_Latn/train/*.arrow"
- config_name: kpg_Latn
data_files:
- split: train
path: "kpg_Latn/train/*.arrow"
- config_name: crh_Cyrl
data_files:
- split: train
path: "crh_Cyrl/train/*.arrow"
- config_name: nyu_Latn
data_files:
- split: train
path: "nyu_Latn/train/*.arrow"
- config_name: ibo_Latn
data_files:
- split: train
path: "ibo_Latn/train/*.arrow"
- config_name: msa_Latn
data_files:
- split: train
path: "msa_Latn/train/*.arrow"
- config_name: prs_Arab
data_files:
- split: train
path: "prs_Arab/train/*.arrow"
- config_name: nap_Latn
data_files:
- split: train
path: "nap_Latn/train/*.arrow"
- config_name: bik_Latn
data_files:
- split: train
path: "bik_Latn/train/*.arrow"
- config_name: srp_Cyrl
data_files:
- split: train
path: "srp_Cyrl/train/*.arrow"
- config_name: lao_Laoo
data_files:
- split: train
path: "lao_Laoo/train/*.arrow"
- config_name: kom_Cyrl
data_files:
- split: train
path: "kom_Cyrl/train/*.arrow"
- config_name: nde_Latn
data_files:
- split: train
path: "nde_Latn/train/*.arrow"
- config_name: hui_Latn
data_files:
- split: train
path: "hui_Latn/train/*.arrow"
- config_name: uig_Latn
data_files:
- split: train
path: "uig_Latn/train/*.arrow"
- config_name: new_Deva
data_files:
- split: train
path: "new_Deva/train/*.arrow"
- config_name: kur_Arab
data_files:
- split: train
path: "kur_Arab/train/*.arrow"
- config_name: sco_Latn
data_files:
- split: train
path: "sco_Latn/train/*.arrow"
- config_name: ayr_Latn
data_files:
- split: train
path: "ayr_Latn/train/*.arrow"
- config_name: suz_Deva
data_files:
- split: train
path: "suz_Deva/train/*.arrow"
- config_name: wal_Latn
data_files:
- split: train
path: "wal_Latn/train/*.arrow"
- config_name: mlt_Latn
data_files:
- split: train
path: "mlt_Latn/train/*.arrow"
- config_name: asm_Beng
data_files:
- split: train
path: "asm_Beng/train/*.arrow"
- config_name: san_Deva
data_files:
- split: train
path: "san_Deva/train/*.arrow"
- config_name: kaz_Cyrl
data_files:
- split: train
path: "kaz_Cyrl/train/*.arrow"
- config_name: iba_Latn
data_files:
- split: train
path: "iba_Latn/train/*.arrow"
- config_name: tuk_Latn
data_files:
- split: train
path: "tuk_Latn/train/*.arrow"
- config_name: nso_Latn
data_files:
- split: train
path: "nso_Latn/train/*.arrow"
- config_name: run_Latn
data_files:
- split: train
path: "run_Latn/train/*.arrow"
- config_name: ctu_Latn
data_files:
- split: train
path: "ctu_Latn/train/*.arrow"
- config_name: bam_Latn
data_files:
- split: train
path: "bam_Latn/train/*.arrow"
- config_name: fin_Latn
data_files:
- split: train
path: "fin_Latn/train/*.arrow"
- config_name: gor_Latn
data_files:
- split: train
path: "gor_Latn/train/*.arrow"
- config_name: kmr_Latn
data_files:
- split: train
path: "kmr_Latn/train/*.arrow"
- config_name: pag_Latn
data_files:
- split: train
path: "pag_Latn/train/*.arrow"
- config_name: niu_Latn
data_files:
- split: train
path: "niu_Latn/train/*.arrow"
- config_name: xmf_Geor
data_files:
- split: train
path: "xmf_Geor/train/*.arrow"
- config_name: ekk_Latn
data_files:
- split: train
path: "ekk_Latn/train/*.arrow"
- config_name: lmo_Latn
data_files:
- split: train
path: "lmo_Latn/train/*.arrow"
- config_name: ceb_Latn
data_files:
- split: train
path: "ceb_Latn/train/*.arrow"
- config_name: mhr_Cyrl
data_files:
- split: train
path: "mhr_Cyrl/train/*.arrow"
- config_name: plt_Latn
data_files:
- split: train
path: "plt_Latn/train/*.arrow"
- config_name: qvi_Latn
data_files:
- split: train
path: "qvi_Latn/train/*.arrow"
- config_name: roh_Latn
data_files:
- split: train
path: "roh_Latn/train/*.arrow"
- config_name: aln_Latn
data_files:
- split: train
path: "aln_Latn/train/*.arrow"
- config_name: mah_Latn
data_files:
- split: train
path: "mah_Latn/train/*.arrow"
- config_name: npi_Deva
data_files:
- split: train
path: "npi_Deva/train/*.arrow"
- config_name: tok_Latn
data_files:
- split: train
path: "tok_Latn/train/*.arrow"
- config_name: mgh_Latn
data_files:
- split: train
path: "mgh_Latn/train/*.arrow"
- config_name: eml_Latn
data_files:
- split: train
path: "eml_Latn/train/*.arrow"
- config_name: pnb_Arab
data_files:
- split: train
path: "pnb_Arab/train/*.arrow"
- config_name: nav_Latn
data_files:
- split: train
path: "nav_Latn/train/*.arrow"
- config_name: cat_Latn
data_files:
- split: train
path: "cat_Latn/train/*.arrow"
- config_name: gym_Latn
data_files:
- split: train
path: "gym_Latn/train/*.arrow"
- config_name: sat_Olck
data_files:
- split: train
path: "sat_Olck/train/*.arrow"
- config_name: snd_Arab
data_files:
- split: train
path: "snd_Arab/train/*.arrow"
- config_name: isl_Latn
data_files:
- split: train
path: "isl_Latn/train/*.arrow"
- config_name: kal_Latn
data_files:
- split: train
path: "kal_Latn/train/*.arrow"
- config_name: aoj_Latn
data_files:
- split: train
path: "aoj_Latn/train/*.arrow"
- config_name: zai_Latn
data_files:
- split: train
path: "zai_Latn/train/*.arrow"
- config_name: guj_Gujr
data_files:
- split: train
path: "guj_Gujr/train/*.arrow"
- config_name: min_Latn
data_files:
- split: train
path: "min_Latn/train/*.arrow"
- config_name: grc_Grek
data_files:
- split: train
path: "grc_Grek/train/*.arrow"
- config_name: hmn_Latn
data_files:
- split: train
path: "hmn_Latn/train/*.arrow"
- config_name: ido_Latn
data_files:
- split: train
path: "ido_Latn/train/*.arrow"
- config_name: khm_Khmr
data_files:
- split: train
path: "khm_Khmr/train/*.arrow"
- config_name: quh_Latn
data_files:
- split: train
path: "quh_Latn/train/*.arrow"
- config_name: ikk_Latn
data_files:
- split: train
path: "ikk_Latn/train/*.arrow"
- config_name: iku_Cans
data_files:
- split: train
path: "iku_Cans/train/*.arrow"
- config_name: tat_Latn
data_files:
- split: train
path: "tat_Latn/train/*.arrow"
- config_name: bel_Cyrl
data_files:
- split: train
path: "bel_Cyrl/train/*.arrow"
- config_name: dyu_Latn
data_files:
- split: train
path: "dyu_Latn/train/*.arrow"
- config_name: que_Latn
data_files:
- split: train
path: "que_Latn/train/*.arrow"
- config_name: quw_Latn
data_files:
- split: train
path: "quw_Latn/train/*.arrow"
- config_name: wol_Latn
data_files:
- split: train
path: "wol_Latn/train/*.arrow"
- config_name: hne_Deva
data_files:
- split: train
path: "hne_Deva/train/*.arrow"
- config_name: zho_Hani
data_files:
- split: train
path: "zho_Hani/train/*.arrow"
- config_name: tum_Latn
data_files:
- split: train
path: "tum_Latn/train/*.arrow"
- config_name: swh_Latn
data_files:
- split: train
path: "swh_Latn/train/*.arrow"
- config_name: kua_Latn
data_files:
- split: train
path: "kua_Latn/train/*.arrow"
- config_name: ncj_Latn
data_files:
- split: train
path: "ncj_Latn/train/*.arrow"
- config_name: ewe_Latn
data_files:
- split: train
path: "ewe_Latn/train/*.arrow"
- config_name: hat_Latn
data_files:
- split: train
path: "hat_Latn/train/*.arrow"
- config_name: ina_Latn
data_files:
- split: train
path: "ina_Latn/train/*.arrow"
- config_name: deu_Latn
data_files:
- split: train
path: "deu_Latn/train/*.arrow"
- config_name: ahk_Latn
data_files:
- split: train
path: "ahk_Latn/train/*.arrow"
- config_name: srm_Latn
data_files:
- split: train
path: "srm_Latn/train/*.arrow"
- config_name: lug_Latn
data_files:
- split: train
path: "lug_Latn/train/*.arrow"
- config_name: ach_Latn
data_files:
- split: train
path: "ach_Latn/train/*.arrow"
- config_name: rmy_Latn
data_files:
- split: train
path: "rmy_Latn/train/*.arrow"
- config_name: smo_Latn
data_files:
- split: train
path: "smo_Latn/train/*.arrow"
- config_name: mos_Latn
data_files:
- split: train
path: "mos_Latn/train/*.arrow"
- config_name: srd_Latn
data_files:
- split: train
path: "srd_Latn/train/*.arrow"
- config_name: ltz_Latn
data_files:
- split: train
path: "ltz_Latn/train/*.arrow"
- config_name: srp_Latn
data_files:
- split: train
path: "srp_Latn/train/*.arrow"
- config_name: azb_Arab
data_files:
- split: train
path: "azb_Arab/train/*.arrow"
- config_name: aze_Arab
data_files:
- split: train
path: "aze_Arab/train/*.arrow"
- config_name: ori_Orya
data_files:
- split: train
path: "ori_Orya/train/*.arrow"
- config_name: mzh_Latn
data_files:
- split: train
path: "mzh_Latn/train/*.arrow"
- config_name: kur_Latn
data_files:
- split: train
path: "kur_Latn/train/*.arrow"
- config_name: wbm_Latn
data_files:
- split: train
path: "wbm_Latn/train/*.arrow"
- config_name: crs_Latn
data_files:
- split: train
path: "crs_Latn/train/*.arrow"
- config_name: ada_Latn
data_files:
- split: train
path: "ada_Latn/train/*.arrow"
- config_name: hif_Latn
data_files:
- split: train
path: "hif_Latn/train/*.arrow"
- config_name: jpn_Japn
data_files:
- split: train
path: "jpn_Japn/train/*.arrow"
- config_name: pcm_Latn
data_files:
- split: train
path: "pcm_Latn/train/*.arrow"
- config_name: tso_Latn
data_files:
- split: train
path: "tso_Latn/train/*.arrow"
- config_name: nor_Latn
data_files:
- split: train
path: "nor_Latn/train/*.arrow"
- config_name: bsb_Latn
data_files:
- split: train
path: "bsb_Latn/train/*.arrow"
- config_name: gaa_Latn
data_files:
- split: train
path: "gaa_Latn/train/*.arrow"
- config_name: ukr_Cyrl
data_files:
- split: train
path: "ukr_Cyrl/train/*.arrow"
- config_name: mon_Latn
data_files:
- split: train
path: "mon_Latn/train/*.arrow"
- config_name: nep_Deva
data_files:
- split: train
path: "nep_Deva/train/*.arrow"
- config_name: guj_Deva
data_files:
- split: train
path: "guj_Deva/train/*.arrow"
- config_name: pis_Latn
data_files:
- split: train
path: "pis_Latn/train/*.arrow"
- config_name: lhu_Latn
data_files:
- split: train
path: "lhu_Latn/train/*.arrow"
- config_name: nya_Latn
data_files:
- split: train
path: "nya_Latn/train/*.arrow"
- config_name: poh_Latn
data_files:
- split: train
path: "poh_Latn/train/*.arrow"
- config_name: nnb_Latn
data_files:
- split: train
path: "nnb_Latn/train/*.arrow"
- config_name: grn_Latn
data_files:
- split: train
path: "grn_Latn/train/*.arrow"
- config_name: mco_Latn
data_files:
- split: train
path: "mco_Latn/train/*.arrow"
- config_name: ory_Orya
data_files:
- split: train
path: "ory_Orya/train/*.arrow"
- config_name: ful_Latn
data_files:
- split: train
path: "ful_Latn/train/*.arrow"
- config_name: diq_Latn
data_files:
- split: train
path: "diq_Latn/train/*.arrow"
- config_name: sag_Latn
data_files:
- split: train
path: "sag_Latn/train/*.arrow"
- config_name: afr_Latn
data_files:
- split: train
path: "afr_Latn/train/*.arrow"
- config_name: haw_Latn
data_files:
- split: train
path: "haw_Latn/train/*.arrow"
- config_name: umb_Latn
data_files:
- split: train
path: "umb_Latn/train/*.arrow"
- config_name: hsb_Latn
data_files:
- split: train
path: "hsb_Latn/train/*.arrow"
- config_name: fij_Latn
data_files:
- split: train
path: "fij_Latn/train/*.arrow"
- config_name: hbs_Cyrl
data_files:
- split: train
path: "hbs_Cyrl/train/*.arrow"
- config_name: san_Latn
data_files:
- split: train
path: "san_Latn/train/*.arrow"
- config_name: vls_Latn
data_files:
- split: train
path: "vls_Latn/train/*.arrow"
- config_name: zsm_Latn
data_files:
- split: train
path: "zsm_Latn/train/*.arrow"
- config_name: lij_Latn
data_files:
- split: train
path: "lij_Latn/train/*.arrow"
- config_name: quc_Latn
data_files:
- split: train
path: "quc_Latn/train/*.arrow"
- config_name: mam_Latn
data_files:
- split: train
path: "mam_Latn/train/*.arrow"
- config_name: tls_Latn
data_files:
- split: train
path: "tls_Latn/train/*.arrow"
- config_name: tuc_Latn
data_files:
- split: train
path: "tuc_Latn/train/*.arrow"
- config_name: dan_Latn
data_files:
- split: train
path: "dan_Latn/train/*.arrow"
- config_name: rue_Cyrl
data_files:
- split: train
path: "rue_Cyrl/train/*.arrow"
- config_name: ace_Latn
data_files:
- split: train
path: "ace_Latn/train/*.arrow"
- config_name: bem_Latn
data_files:
- split: train
path: "bem_Latn/train/*.arrow"
- config_name: kam_Latn
data_files:
- split: train
path: "kam_Latn/train/*.arrow"
- config_name: kaa_Latn
data_files:
- split: train
path: "kaa_Latn/train/*.arrow"
- config_name: ndo_Latn
data_files:
- split: train
path: "ndo_Latn/train/*.arrow"
- config_name: oss_Cyrl
data_files:
- split: train
path: "oss_Cyrl/train/*.arrow"
- config_name: lit_Latn
data_files:
- split: train
path: "lit_Latn/train/*.arrow"
- config_name: frr_Latn
data_files:
- split: train
path: "frr_Latn/train/*.arrow"
- config_name: yap_Latn
data_files:
- split: train
path: "yap_Latn/train/*.arrow"
- config_name: bzj_Latn
data_files:
- split: train
path: "bzj_Latn/train/*.arrow"
- config_name: gom_Latn
data_files:
- split: train
path: "gom_Latn/train/*.arrow"
- config_name: swe_Latn
data_files:
- split: train
path: "swe_Latn/train/*.arrow"
- config_name: lfn_Latn
data_files:
- split: train
path: "lfn_Latn/train/*.arrow"
- config_name: cmn_Hani
data_files:
- split: train
path: "cmn_Hani/train/*.arrow"
- config_name: mon_Cyrl
data_files:
- split: train
path: "mon_Cyrl/train/*.arrow"
- config_name: vep_Latn
data_files:
- split: train
path: "vep_Latn/train/*.arrow"
- config_name: ixl_Latn
data_files:
- split: train
path: "ixl_Latn/train/*.arrow"
- config_name: gil_Latn
data_files:
- split: train
path: "gil_Latn/train/*.arrow"
- config_name: mau_Latn
data_files:
- split: train
path: "mau_Latn/train/*.arrow"
- config_name: tsn_Latn
data_files:
- split: train
path: "tsn_Latn/train/*.arrow"
- config_name: aym_Latn
data_files:
- split: train
path: "aym_Latn/train/*.arrow"
- config_name: vec_Latn
data_files:
- split: train
path: "vec_Latn/train/*.arrow"
- config_name: gom_Deva
data_files:
- split: train
path: "gom_Deva/train/*.arrow"
- config_name: fur_Latn
data_files:
- split: train
path: "fur_Latn/train/*.arrow"
- config_name: kin_Latn
data_files:
- split: train
path: "kin_Latn/train/*.arrow"
- config_name: gcr_Latn
data_files:
- split: train
path: "gcr_Latn/train/*.arrow"
- config_name: sgs_Latn
data_files:
- split: train
path: "sgs_Latn/train/*.arrow"
- config_name: bih_Deva
data_files:
- split: train
path: "bih_Deva/train/*.arrow"
- config_name: vie_Latn
data_files:
- split: train
path: "vie_Latn/train/*.arrow"
- config_name: tha_Thai
data_files:
- split: train
path: "tha_Thai/train/*.arrow"
- config_name: pau_Latn
data_files:
- split: train
path: "pau_Latn/train/*.arrow"
- config_name: est_Latn
data_files:
- split: train
path: "est_Latn/train/*.arrow"
- config_name: lue_Latn
data_files:
- split: train
path: "lue_Latn/train/*.arrow"
- config_name: rug_Latn
data_files:
- split: train
path: "rug_Latn/train/*.arrow"
- config_name: kjb_Latn
data_files:
- split: train
path: "kjb_Latn/train/*.arrow"
- config_name: kik_Latn
data_files:
- split: train
path: "kik_Latn/train/*.arrow"
- config_name: mri_Latn
data_files:
- split: train
path: "mri_Latn/train/*.arrow"
- config_name: ber_Latn
data_files:
- split: train
path: "ber_Latn/train/*.arrow"
- config_name: ssw_Latn
data_files:
- split: train
path: "ssw_Latn/train/*.arrow"
- config_name: cab_Latn
data_files:
- split: train
path: "cab_Latn/train/*.arrow"
- config_name: quz_Latn
data_files:
- split: train
path: "quz_Latn/train/*.arrow"
- config_name: arb_Arab
data_files:
- split: train
path: "arb_Arab/train/*.arrow"
- config_name: mai_Deva
data_files:
- split: train
path: "mai_Deva/train/*.arrow"
- config_name: bew_Cyrl
data_files:
- split: train
path: "bew_Cyrl/train/*.arrow"
- config_name: tat_Cyrl
data_files:
- split: train
path: "tat_Cyrl/train/*.arrow"
- config_name: mya_Mymr
data_files:
- split: train
path: "mya_Mymr/train/*.arrow"
- config_name: alt_Cyrl
data_files:
- split: train
path: "alt_Cyrl/train/*.arrow"
- config_name: nno_Latn
data_files:
- split: train
path: "nno_Latn/train/*.arrow"
- config_name: hrx_Latn
data_files:
- split: train
path: "hrx_Latn/train/*.arrow"
- config_name: hau_Latn
data_files:
- split: train
path: "hau_Latn/train/*.arrow"
- config_name: gsw_Latn
data_files:
- split: train
path: "gsw_Latn/train/*.arrow"
- config_name: pam_Latn
data_files:
- split: train
path: "pam_Latn/train/*.arrow"
- config_name: sun_Latn
data_files:
- split: train
path: "sun_Latn/train/*.arrow"
- config_name: lat_Latn
data_files:
- split: train
path: "lat_Latn/train/*.arrow"
- config_name: bis_Latn
data_files:
- split: train
path: "bis_Latn/train/*.arrow"
- config_name: udm_Cyrl
data_files:
- split: train
path: "udm_Cyrl/train/*.arrow"
- config_name: tca_Latn
data_files:
- split: train
path: "tca_Latn/train/*.arrow"
- config_name: uig_Arab
data_files:
- split: train
path: "uig_Arab/train/*.arrow"
- config_name: glg_Latn
data_files:
- split: train
path: "glg_Latn/train/*.arrow"
- config_name: tah_Latn
data_files:
- split: train
path: "tah_Latn/train/*.arrow"
- config_name: ckb_Arab
data_files:
- split: train
path: "ckb_Arab/train/*.arrow"
- config_name: gle_Latn
data_files:
- split: train
path: "gle_Latn/train/*.arrow"
- config_name: lim_Latn
data_files:
- split: train
path: "lim_Latn/train/*.arrow"
- config_name: slk_Latn
data_files:
- split: train
path: "slk_Latn/train/*.arrow"
- config_name: nds_Latn
data_files:
- split: train
path: "nds_Latn/train/*.arrow"
- config_name: kor_Hang
data_files:
- split: train
path: "kor_Hang/train/*.arrow"
- config_name: uzb_Latn
data_files:
- split: train
path: "uzb_Latn/train/*.arrow"
- config_name: pfl_Latn
data_files:
- split: train
path: "pfl_Latn/train/*.arrow"
- config_name: azj_Latn
data_files:
- split: train
path: "azj_Latn/train/*.arrow"
- config_name: tgk_Cyrl
data_files:
- split: train
path: "tgk_Cyrl/train/*.arrow"
- config_name: glv_Latn
data_files:
- split: train
path: "glv_Latn/train/*.arrow"
- config_name: jam_Latn
data_files:
- split: train
path: "jam_Latn/train/*.arrow"
- config_name: kat_Geor
data_files:
- split: train
path: "kat_Geor/train/*.arrow"
- config_name: fry_Latn
data_files:
- split: train
path: "fry_Latn/train/*.arrow"
- config_name: kat_Latn
data_files:
- split: train
path: "kat_Latn/train/*.arrow"
- config_name: twi_Latn
data_files:
- split: train
path: "twi_Latn/train/*.arrow"
- config_name: eus_Latn
data_files:
- split: train
path: "eus_Latn/train/*.arrow"
- config_name: toi_Latn
data_files:
- split: train
path: "toi_Latn/train/*.arrow"
- config_name: mlg_Latn
data_files:
- split: train
path: "mlg_Latn/train/*.arrow"
- config_name: tyv_Cyrl
data_files:
- split: train
path: "tyv_Cyrl/train/*.arrow"
- config_name: arz_Arab
data_files:
- split: train
path: "arz_Arab/train/*.arrow"
- config_name: hyw_Armn
data_files:
- split: train
path: "hyw_Armn/train/*.arrow"
- config_name: chk_Latn
data_files:
- split: train
path: "chk_Latn/train/*.arrow"
- config_name: vol_Latn
data_files:
- split: train
path: "vol_Latn/train/*.arrow"
- config_name: kek_Latn
data_files:
- split: train
path: "kek_Latn/train/*.arrow"
- config_name: teo_Latn
data_files:
- split: train
path: "teo_Latn/train/*.arrow"
- config_name: ell_Grek
data_files:
- split: train
path: "ell_Grek/train/*.arrow"
- config_name: kan_Knda
data_files:
- split: train
path: "kan_Knda/train/*.arrow"
- config_name: tpi_Latn
data_files:
- split: train
path: "tpi_Latn/train/*.arrow"
- config_name: rop_Latn
data_files:
- split: train
path: "rop_Latn/train/*.arrow"
- config_name: lua_Latn
data_files:
- split: train
path: "lua_Latn/train/*.arrow"
- config_name: mad_Latn
data_files:
- split: train
path: "mad_Latn/train/*.arrow"
- config_name: top_Latn
data_files:
- split: train
path: "top_Latn/train/*.arrow"
- config_name: scn_Latn
data_files:
- split: train
path: "scn_Latn/train/*.arrow"
- config_name: war_Latn
data_files:
- split: train
path: "war_Latn/train/*.arrow"
- config_name: ngl_Latn
data_files:
- split: train
path: "ngl_Latn/train/*.arrow"
- config_name: mal_Mlym
data_files:
- split: train
path: "mal_Mlym/train/*.arrow"
- config_name: szl_Latn
data_files:
- split: train
path: "szl_Latn/train/*.arrow"
- config_name: orm_Latn
data_files:
- split: train
path: "orm_Latn/train/*.arrow"
- config_name: urd_Arab
data_files:
- split: train
path: "urd_Arab/train/*.arrow"
- config_name: cbk_Latn
data_files:
- split: train
path: "cbk_Latn/train/*.arrow"
- config_name: tgk_Arab
data_files:
- split: train
path: "tgk_Arab/train/*.arrow"
multilinguality:
- multilingual
pinned: true
tags:
- multilingual
language:
- abk
- ace
- ach
- acm
- acr
- ada
- afb
- afr
- ahk
- ajp
- aka
- aln
- als
- alt
- amh
- aoj
- apc
- ara
- arb
- arg
- arn
- ary
- arz
- asm
- ast
- aym
- ayr
- azb
- aze
- azj
- bak
- bam
- ban
- bar
- bcl
- bel
- bem
- ber
- bew
- bih
- bik
- bis
- bjn
- bod
- bos
- bpy
- bqc
- bre
- bsb
- bul
- bzj
- cab
- cak
- cat
- cbk
- ceb
- ces
- che
- chk
- chv
- cjk
- ckb
- cmn
- cos
- crh
- crs
- csb
- csy
- ctu
- cuk
- cym
- dan
- deu
- diq
- div
- djk
- dtp
- dyu
- dzo
- ekk
- ell
- eml
- eng
- enm
- epo
- est
- eus
- ewe
- ext
- fao
- fas
- fij
- fil
- fin
- fon
- fra
- frr
- fry
- ful
- fur
- gaa
- gcf
- gcr
- gil
- gla
- gle
- glg
- glv
- gom
- gor
- grc
- grn
- gsw
- guc
- gug
- guj
- gym
- hat
- hau
- haw
- hbo
- hbs
- heb
- hif
- hil
- hin
- hmn
- hmo
- hne
- hnj
- hrv
- hrx
- hsb
- hui
- hun
- hus
- hye
- hyw
- iba
- ibo
- ido
- ikk
- iku
- ile
- ilo
- ina
- ind
- isl
- ita
- ixl
- jam
- jav
- jbo
- jpn
- kaa
- kab
- kac
- kal
- kam
- kan
- kat
- kaz
- kbd
- kbp
- kea
- kek
- khm
- kik
- kin
- kir
- kjb
- kjh
- kmb
- kmr
- knv
- kom
- kon
- kor
- kos
- kpg
- krc
- ksd
- ksh
- ksw
- kua
- kur
- lao
- lat
- lfn
- lhu
- lij
- lim
- lin
- lit
- lmo
- ltz
- lua
- lue
- lug
- luo
- lus
- lvs
- lzh
- mad
- mah
- mai
- mal
- mam
- mar
- mau
- mco
- meu
- mgh
- mhr
- min
- miq
- mkd
- mlg
- mlt
- mon
- mos
- mps
- mri
- msa
- mwl
- mya
- myv
- mzh
- mzn
- nan
- nap
- naq
- nav
- nbl
- nch
- ncj
- nde
- ndo
- nds
- nep
- new
- ngl
- ngu
- niu
- nld
- nnb
- nno
- nob
- nor
- npi
- nso
- nya
- nyu
- oci
- ori
- orm
- ory
- oss
- ote
- pag
- pam
- pan
- pap
- pau
- pcd
- pcm
- pes
- pfl
- pis
- pls
- plt
- pms
- pnb
- poh
- pol
- pon
- por
- prs
- pus
- qub
- quc
- que
- quh
- quw
- quy
- quz
- qvi
- rap
- rmy
- roh
- ron
- rop
- rue
- rug
- run
- sag
- sah
- san
- sat
- scn
- sco
- seh
- sgs
- sin
- slk
- slv
- sme
- smo
- sna
- snd
- som
- sot
- spa
- sqi
- srd
- srm
- srn
- srp
- ssw
- sun
- suz
- swa
- swc
- swe
- swh
- szl
- tah
- tam
- tat
- tbz
- tca
- tdt
- teo
- tgk
- tgl
- tha
- tir
- tlh
- tls
- toi
- toj
- tok
- ton
- top
- tpi
- tsn
- tso
- tuc
- tuk
- tum
- tur
- tvl
- twi
- tyv
- tzo
- udm
- uig
- ukr
- umb
- urd
- uzb
- uzn
- vec
- ven
- vep
- vie
- vls
- vol
- wal
- war
- wbm
- wln
- wol
- wuu
- xav
- xho
- xmf
- yao
- yap
- yid
- yom
- yor
- yue
- zai
- zea
- zho
- zlm
- zsm
- zul
pretty_name: Glot500 Corpus
---
# Glot500 Corpus
A dataset of natural language data collected by combining more than 150
existing monolingual and multilingual datasets and crawling known multilingual websites.
The focus of this dataset is on 500 extremely low-resource languages.
(More languages are still to be uploaded here.)
This dataset is used to train the [Glot500](https://huggingface.co/cis-lmu/glot500-base) model.
- **Homepage:** [homepage](https://github.com/cisnlp/Glot500)
- **Repository:** [github](https://github.com/cisnlp/Glot500)
- **Paper:** [acl](https://aclanthology.org/2023.acl-long.61/), [arxiv](https://arxiv.org/abs/2305.12182)
This dataset has the same data format as the [Taxi1500 Raw Data](https://huggingface.co/datasets/cis-lmu/Taxi1500-RawData) dataset, so that both datasets can be used in parallel seamlessly.
Parts of the original Glot500 dataset cannot be published publicly.
Please fill out [this form](https://docs.google.com/forms/d/1FHto_4wWYvEF3lz7DDo3P8wQqfS3WhpYfAu5vM95-qU/viewform?edit_requested=true) to get access to these parts.
## Usage
Replace `nbl_Latn` with your specific language.
```python
from datasets import load_dataset
dataset = load_dataset('cis-lmu/Glot500', 'nbl_Latn', split='train')
print(dataset[0])  # First row of nbl_Latn
```
<details>
<summary>Click to show supported languages:</summary>
```
ton_Latn
nld_Latn
tzo_Latn
leh_Latn
cuk_Latn
ibg_Latn
uzb_Cyrl
jav_Latn
rap_Latn
zpa_Latn
bak_Cyrl
por_Latn
quy_Latn
ast_Latn
cos_Latn
fon_Latn
sna_Latn
dzo_Tibt
nob_Latn
nch_Latn
ish_Latn
che_Cyrl
ext_Latn
ldi_Latn
dtp_Latn
yue_Hani
kbd_Cyrl
mar_Deva
ron_Latn
acr_Latn
afb_Arab
sqi_Latn
eng_Latn
ksd_Latn
rus_Cyrl
bcl_Latn
ksh_Latn
hin_Latn
myv_Cyrl
kjh_Cyrl
sah_Cyrl
gkp_Latn
naq_Latn
tdt_Latn
rmn_Cyrl
kac_Latn
cak_Latn
kir_Cyrl
mps_Latn
yid_Hebr
dhv_Latn
srn_Latn
div_Thaa
mkd_Cyrl
idu_Latn
bre_Latn
bas_Latn
ven_Latn
pxm_Latn
wuu_Hani
mwl_Latn
miq_Latn
kss_Latn
wes_Latn
slv_Latn
hrv_Latn
hmo_Latn
som_Latn
bod_Tibt
pls_Latn
ile_Latn
luo_Latn
pus_Arab
fao_Latn
fas_Arab
swa_Latn
ifb_Latn
ary_Arab
tbz_Latn
hus_Latn
ote_Latn
ilo_Latn
ctd_Latn
abk_Cyrl
bqc_Latn
hil_Latn
pon_Latn
zul_Latn
als_Latn
pes_Arab
bpy_Beng
bos_Latn
sot_Latn
lin_Latn
tuk_Cyrl
gla_Latn
wln_Latn
apc_Arab
hin_Deva
hye_Armn
tir_Ethi
pap_Latn
gcf_Latn
cjk_Latn
pcd_Latn
tur_Latn
kon_Latn
mwn_Latn
izz_Latn
xho_Latn
lam_Latn
guc_Latn
aka_Latn
kea_Latn
sme_Latn
fat_Latn
csb_Latn
bak_Latn
djk_Latn
xav_Latn
oci_Latn
acm_Arab
rmy_Cyrl
bim_Latn
mck_Latn
krc_Cyrl
cym_Latn
lus_Latn
ncx_Latn
ngu_Latn
yom_Latn
tam_Taml
ajp_Arab
epo_Latn
fra_Latn
ita_Latn
seh_Latn
sxn_Latn
pdt_Latn
hbs_Latn
uzn_Cyrl
bhw_Latn
ksw_Mymr
pms_Latn
zlm_Latn
ami_Latn
qub_Latn
twx_Latn
tsz_Latn
kaa_Cyrl
toj_Latn
toh_Latn
kos_Latn
ogo_Latn
kab_Latn
pan_Guru
nan_Latn
aze_Latn
prk_Latn
ara_Arab
meu_Latn
nba_Latn
lvs_Latn
nbl_Latn
loz_Latn
crh_Latn
bci_Latn
kbp_Latn
tgl_Latn
kmb_Latn
hun_Latn
nzi_Latn
yao_Latn
arn_Latn
hyw_Cyrl
vmw_Latn
jbo_Latn
mzn_Arab
lzh_Hani
heb_Hebr
cce_Latn
bjn_Latn
gug_Latn
yor_Latn
ban_Latn
tlh_Latn
chv_Cyrl
sin_Sinh
ind_Latn
dua_Latn
sid_Latn
amh_Ethi
zea_Latn
kpg_Latn
crh_Cyrl
nyu_Latn
dln_Latn
ibo_Latn
tih_Latn
msa_Latn
nap_Latn
mgr_Latn
bik_Latn
srp_Cyrl
lao_Laoo
guw_Latn
kom_Cyrl
sop_Latn
nde_Latn
hui_Latn
cfm_Latn
new_Deva
kur_Arab
sco_Latn
nyk_Latn
lun_Latn
suz_Deva
wal_Latn
asm_Beng
rar_Latn
san_Deva
kaz_Cyrl
tog_Latn
iba_Latn
tuk_Latn
nso_Latn
run_Latn
ctu_Latn
bam_Latn
fin_Latn
gor_Latn
kmr_Latn
ben_Beng
pag_Latn
niu_Latn
xmf_Geor
ekk_Latn
tsc_Latn
lmo_Latn
mhr_Cyrl
plt_Latn
qvi_Latn
roh_Latn
oke_Latn
mah_Latn
tok_Latn
mgh_Latn
eml_Latn
urh_Latn
pnb_Arab
yua_Latn
nav_Latn
zne_Latn
bin_Latn
cat_Latn
gym_Latn
sat_Olck
snd_Arab
isl_Latn
rmn_Grek
bba_Latn
kal_Latn
aoj_Latn
qug_Latn
zai_Latn
guj_Gujr
min_Latn
tob_Latn
grc_Grek
hmn_Latn
ido_Latn
khm_Khmr
ikk_Latn
iku_Cans
tat_Latn
bel_Cyrl
dyu_Latn
que_Latn
efi_Latn
quw_Latn
nyn_Latn
wol_Latn
hne_Deva
zho_Hani
swh_Latn
bum_Latn
kua_Latn
ncj_Latn
ewe_Latn
hat_Latn
ina_Latn
mfe_Latn
ahk_Latn
srm_Latn
lug_Latn
ach_Latn
rmy_Latn
tpm_Latn
smo_Latn
mos_Latn
srd_Latn
srp_Latn
azb_Arab
ori_Orya
mzh_Latn
kur_Latn
phm_Latn
kwn_Latn
crs_Latn
ada_Latn
ttj_Latn
hif_Latn
tzh_Latn
tdx_Latn
bbc_Latn
cnh_Latn
pcm_Latn
tso_Latn
nor_Latn
bsb_Latn
kqn_Latn
gaa_Latn
ukr_Cyrl
lav_Latn
nep_Deva
kmr_Cyrl
ige_Latn
pis_Latn
lhu_Latn
nya_Latn
tiv_Latn
mny_Latn
kri_Latn
nyy_Latn
poh_Latn
nnb_Latn
grn_Latn
mco_Latn
ory_Orya
ful_Latn
diq_Latn
sag_Latn
tel_Telu
afr_Latn
haw_Latn
umb_Latn
hsb_Latn
fij_Latn
hbs_Cyrl
san_Latn
vls_Latn
zsm_Latn
lij_Latn
quc_Latn
mam_Latn
tuc_Latn
dan_Latn
rue_Cyrl
ace_Latn
bem_Latn
kam_Latn
ndo_Latn
mbb_Latn
mrw_Latn
ajg_Latn
oss_Cyrl
her_Latn
lit_Latn
frr_Latn
yap_Latn
bzj_Latn
gom_Latn
swe_Latn
lfn_Latn
cmn_Hani
mon_Cyrl
vep_Latn
ixl_Latn
gil_Latn
mau_Latn
aym_Latn
gom_Deva
fur_Latn
cgg_Latn
chw_Latn
kin_Latn
alz_Latn
ndc_Latn
gcr_Latn
rmn_Latn
sgs_Latn
bih_Deva
skg_Latn
bts_Latn
vie_Latn
tha_Thai
tcf_Latn
pau_Latn
est_Latn
lue_Latn
rug_Latn
gur_Latn
kik_Latn
mri_Latn
ber_Latn
ssw_Latn
cab_Latn
quz_Latn
arb_Arab
mai_Deva
tat_Cyrl
mya_Mymr
alt_Cyrl
nno_Latn
nse_Latn
hrx_Latn
hau_Latn
koo_Latn
gsw_Latn
pam_Latn
sun_Latn
lat_Latn
bis_Latn
btx_Latn
udm_Cyrl
xmv_Latn
tca_Latn
uig_Arab
glg_Latn
tah_Latn
llb_Latn
ckb_Arab
gle_Latn
lim_Latn
slk_Latn
nds_Latn
kor_Hang
uzb_Latn
gkn_Latn
pfl_Latn
azj_Latn
glv_Latn
jam_Latn
kat_Geor
abn_Latn
fry_Latn
kat_Latn
twi_Latn
eus_Latn
toi_Latn
mlg_Latn
ifa_Latn
tyv_Cyrl
arz_Arab
chk_Latn
vol_Latn
kek_Latn
teo_Latn
ell_Grek
kan_Knda
rng_Latn
tpi_Latn
mdy_Ethi
lua_Latn
mad_Latn
top_Latn
scn_Latn
ngl_Latn
mal_Mlym
szl_Latn
orm_Latn
nia_Latn
urd_Arab
mxv_Latn
cbk_Latn
```
</details>
## License
We don't own any part of the data. The original source of each sentence of the data is indicated in the dataset field.
To see the copyright license of the original datasets visit [here](https://github.com/cisnlp/Glot500#glot500-c).
We license the actual packaging, the metadata and the annotations of these data under CC0-1.0.
If you are a website/dataset owner and do not want your data to be included in this corpus, please send us an email at glot500@cis.lmu.de.
## Ethical Considerations
**1. Biases:** The text corpus may reflect the perspectives, opinions, or demographics of its sources or creators. It is important for users to critically evaluate the text in context, especially for news sources and social media.
**2. Representativeness:** While we have aimed for diversity and inclusivity, the text corpus may not fully represent all native speakers. Users should be mindful of any potential underrepresentation.
**3. Ethics:** We acknowledge that the collection and use of text data can have ethical implications. We have strived to handle the data responsibly, but we encourage users to consider the broader ethical implications of their own research or applications.
## Citation
If you use any part of this code and data in your research, please cite it using the following BibTeX entry.
```
@inproceedings{imanigooghari-etal-2023-glot500,
title = "Glot500: Scaling Multilingual Corpora and Language Models to 500 Languages",
author = {ImaniGooghari, Ayyoob and
Lin, Peiqin and
Kargaran, Amir Hossein and
Severini, Silvia and
Jalili Sabet, Masoud and
Kassner, Nora and
Ma, Chunlan and
Schmid, Helmut and
Martins, Andr{\'e} and
Yvon, Fran{\c{c}}ois and
Sch{\"u}tze, Hinrich},
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.61",
doi = "10.18653/v1/2023.acl-long.61",
pages = "1082--1117",
abstract = "The NLP community has mainly focused on scaling Large Language Models (LLMs) vertically, i.e., making them better for about 100 languages. We instead scale LLMs horizontally: we create, through continued pretraining, Glot500-m, an LLM that covers 511 predominantly low-resource languages. An important part of this effort is to collect and clean Glot500-c, a corpus that covers these 511 languages and allows us to train Glot500-m. We evaluate Glot500-m on five diverse tasks across these languages. We observe large improvements for both high-resource and low-resource languages compared to an XLM-R baseline. Our analysis shows that no single factor explains the quality of multilingual LLM representations. Rather, a combination of factors determines quality including corpus size, script, {``}help{''} from related languages and the total capacity of the model. Our work addresses an important goal of NLP research: we should not limit NLP to a small fraction of the world{'}s languages and instead strive to support as many languages as possible to bring the benefits of NLP technology to all languages and cultures. Code, data and models are available at \url{https://github.com/cisnlp/Glot500}.",
}
```
提供机构:
cis-lmu
原始信息汇总
数据集概述
数据来源
- 本数据集整合了超过150个现有的单语和多语数据集。
- 通过爬取已知的多语种网站收集数据。
数据特点
- 专注于500种极低资源语言。
- 更多语言数据仍在持续上传中。



