five

Yusser/CC-LARD

收藏
Hugging Face2026-03-19 更新2026-04-12 收录
下载链接:
https://hf-mirror.com/datasets/Yusser/CC-LARD
下载链接
链接失效反馈
官方服务:
资源简介:
--- language: - multilingual license: cc-by-4.0 task_categories: - text-generation - text-classification tags: - common-crawl - multilingual - regional-corpus - geographic size_categories: - 10M<n<100M configs: - config_name: "af-AF" data_files: - split: train path: "data/af-AF/*.parquet" - config_name: "af-BR" data_files: - split: train path: "data/af-BR/*.parquet" - config_name: "af-CO" data_files: - split: train path: "data/af-CO/*.parquet" - config_name: "af-CZ" data_files: - split: train path: "data/af-CZ/*.parquet" - config_name: "af-ES" data_files: - split: train path: "data/af-ES/*.parquet" - config_name: "af-GB" data_files: - split: train path: "data/af-GB/*.parquet" - config_name: "af-IT" data_files: - split: train path: "data/af-IT/*.parquet" - config_name: "af-NA" data_files: - split: train path: "data/af-NA/*.parquet" - config_name: "af-NL" data_files: - split: train path: "data/af-NL/*.parquet" - config_name: "af-NU" data_files: - split: train path: "data/af-NU/*.parquet" - config_name: "af-RU" data_files: - split: train path: "data/af-RU/*.parquet" - config_name: "af-SA" data_files: - split: train path: "data/af-SA/*.parquet" - config_name: "af-TK" data_files: - split: train path: "data/af-TK/*.parquet" - config_name: "af-US" data_files: - split: train path: "data/af-US/*.parquet" - config_name: "af-VN" data_files: - split: train path: "data/af-VN/*.parquet" - config_name: "af-WS" data_files: - split: train path: "data/af-WS/*.parquet" - config_name: "af-XX" data_files: - split: train path: "data/af-XX/*.parquet" - config_name: "af-ZA" data_files: - split: train path: "data/af-ZA/*.parquet" - config_name: "als-DE" data_files: - split: train path: "data/als-DE/*.parquet" - config_name: "als-GB" data_files: - split: train path: "data/als-GB/*.parquet" - config_name: "als-IT" data_files: - split: train path: "data/als-IT/*.parquet" - config_name: "als-US" data_files: - split: train path: "data/als-US/*.parquet" - config_name: "als-XX" data_files: - 
split: train path: "data/als-XX/*.parquet" - config_name: "am-AM" data_files: - split: train path: "data/am-AM/*.parquet" - config_name: "am-CA" data_files: - split: train path: "data/am-CA/*.parquet" - config_name: "am-CC" data_files: - split: train path: "data/am-CC/*.parquet" - config_name: "am-DE" data_files: - split: train path: "data/am-DE/*.parquet" - config_name: "am-ET" data_files: - split: train path: "data/am-ET/*.parquet" - config_name: "am-FI" data_files: - split: train path: "data/am-FI/*.parquet" - config_name: "am-FR" data_files: - split: train path: "data/am-FR/*.parquet" - config_name: "am-GB" data_files: - split: train path: "data/am-GB/*.parquet" - config_name: "am-NL" data_files: - split: train path: "data/am-NL/*.parquet" - config_name: "am-PL" data_files: - split: train path: "data/am-PL/*.parquet" - config_name: "am-SE" data_files: - split: train path: "data/am-SE/*.parquet" - config_name: "am-TI" data_files: - split: train path: "data/am-TI/*.parquet" - config_name: "am-US" data_files: - split: train path: "data/am-US/*.parquet" - config_name: "am-VA" data_files: - split: train path: "data/am-VA/*.parquet" - config_name: "am-VN" data_files: - split: train path: "data/am-VN/*.parquet" - config_name: "am-XX" data_files: - split: train path: "data/am-XX/*.parquet" - config_name: "an-XX" data_files: - split: train path: "data/an-XX/*.parquet" - config_name: "ar-AA" data_files: - split: train path: "data/ar-AA/*.parquet" - config_name: "ar-AB" data_files: - split: train path: "data/ar-AB/*.parquet" - config_name: "ar-AE" data_files: - split: train path: "data/ar-AE/*.parquet" - config_name: "ar-AF" data_files: - split: train path: "data/ar-AF/*.parquet" - config_name: "ar-AI" data_files: - split: train path: "data/ar-AI/*.parquet" - config_name: "ar-AL" data_files: - split: train path: "data/ar-AL/*.parquet" - config_name: "ar-AM" data_files: - split: train path: "data/ar-AM/*.parquet" - config_name: "ar-AR" data_files: - split: train path: 
"data/ar-AR/*.parquet" - config_name: "ar-AS" data_files: - split: train path: "data/ar-AS/*.parquet" - config_name: "ar-AT" data_files: - split: train path: "data/ar-AT/*.parquet" - config_name: "ar-AU" data_files: - split: train path: "data/ar-AU/*.parquet" - config_name: "ar-AZ" data_files: - split: train path: "data/ar-AZ/*.parquet" - config_name: "ar-BE" data_files: - split: train path: "data/ar-BE/*.parquet" - config_name: "ar-BH" data_files: - split: train path: "data/ar-BH/*.parquet" - config_name: "ar-BR" data_files: - split: train path: "data/ar-BR/*.parquet" - config_name: "ar-BZ" data_files: - split: train path: "data/ar-BZ/*.parquet" - config_name: "ar-CA" data_files: - split: train path: "data/ar-CA/*.parquet" - config_name: "ar-CC" data_files: - split: train path: "data/ar-CC/*.parquet" - config_name: "ar-CF" data_files: - split: train path: "data/ar-CF/*.parquet" - config_name: "ar-CH" data_files: - split: train path: "data/ar-CH/*.parquet" - config_name: "ar-CL" data_files: - split: train path: "data/ar-CL/*.parquet" - config_name: "ar-CN" data_files: - split: train path: "data/ar-CN/*.parquet" - config_name: "ar-CO" data_files: - split: train path: "data/ar-CO/*.parquet" - config_name: "ar-CU" data_files: - split: train path: "data/ar-CU/*.parquet" - config_name: "ar-CZ" data_files: - split: train path: "data/ar-CZ/*.parquet" - config_name: "ar-DE" data_files: - split: train path: "data/ar-DE/*.parquet" - config_name: "ar-DJ" data_files: - split: train path: "data/ar-DJ/*.parquet" - config_name: "ar-DK" data_files: - split: train path: "data/ar-DK/*.parquet" - config_name: "ar-DZ" data_files: - split: train path: "data/ar-DZ/*.parquet" - config_name: "ar-EG" data_files: - split: train path: "data/ar-EG/*.parquet" - config_name: "ar-EN" data_files: - split: train path: "data/ar-EN/*.parquet" - config_name: "ar-ES" data_files: - split: train path: "data/ar-ES/*.parquet" - config_name: "ar-EU" data_files: - split: train path: "data/ar-EU/*.parquet" - 
config_name: "ar-FI" data_files: - split: train path: "data/ar-FI/*.parquet" - config_name: "ar-FM" data_files: - split: train path: "data/ar-FM/*.parquet" - config_name: "ar-FR" data_files: - split: train path: "data/ar-FR/*.parquet" - config_name: "ar-GB" data_files: - split: train path: "data/ar-GB/*.parquet" - config_name: "ar-GG" data_files: - split: train path: "data/ar-GG/*.parquet" - config_name: "ar-GP" data_files: - split: train path: "data/ar-GP/*.parquet" - config_name: "ar-GR" data_files: - split: train path: "data/ar-GR/*.parquet" - config_name: "ar-HK" data_files: - split: train path: "data/ar-HK/*.parquet" - config_name: "ar-HR" data_files: - split: train path: "data/ar-HR/*.parquet" - config_name: "ar-HT" data_files: - split: train path: "data/ar-HT/*.parquet" - config_name: "ar-ID" data_files: - split: train path: "data/ar-ID/*.parquet" - config_name: "ar-IE" data_files: - split: train path: "data/ar-IE/*.parquet" - config_name: "ar-IL" data_files: - split: train path: "data/ar-IL/*.parquet" - config_name: "ar-IM" data_files: - split: train path: "data/ar-IM/*.parquet" - config_name: "ar-IN" data_files: - split: train path: "data/ar-IN/*.parquet" - config_name: "ar-IO" data_files: - split: train path: "data/ar-IO/*.parquet" - config_name: "ar-IQ" data_files: - split: train path: "data/ar-IQ/*.parquet" - config_name: "ar-IR" data_files: - split: train path: "data/ar-IR/*.parquet" - config_name: "ar-IS" data_files: - split: train path: "data/ar-IS/*.parquet" - config_name: "ar-IT" data_files: - split: train path: "data/ar-IT/*.parquet" - config_name: "ar-JO" data_files: - split: train path: "data/ar-JO/*.parquet" - config_name: "ar-JP" data_files: - split: train path: "data/ar-JP/*.parquet" - config_name: "ar-KI" data_files: - split: train path: "data/ar-KI/*.parquet" - config_name: "ar-KM" data_files: - split: train path: "data/ar-KM/*.parquet" - config_name: "ar-KR" data_files: - split: train path: "data/ar-KR/*.parquet" - config_name: "ar-KW" 
data_files: - split: train path: "data/ar-KW/*.parquet" - config_name: "ar-KZ" data_files: - split: train path: "data/ar-KZ/*.parquet" - config_name: "ar-LA" data_files: - split: train path: "data/ar-LA/*.parquet" - config_name: "ar-LB" data_files: - split: train path: "data/ar-LB/*.parquet" - config_name: "ar-LI" data_files: - split: train path: "data/ar-LI/*.parquet" - config_name: "ar-LT" data_files: - split: train path: "data/ar-LT/*.parquet" - config_name: "ar-LV" data_files: - split: train path: "data/ar-LV/*.parquet" - config_name: "ar-LY" data_files: - split: train path: "data/ar-LY/*.parquet" - config_name: "ar-MA" data_files: - split: train path: "data/ar-MA/*.parquet" - config_name: "ar-ME" data_files: - split: train path: "data/ar-ME/*.parquet" - config_name: "ar-ML" data_files: - split: train path: "data/ar-ML/*.parquet" - config_name: "ar-MP" data_files: - split: train path: "data/ar-MP/*.parquet" - config_name: "ar-MR" data_files: - split: train path: "data/ar-MR/*.parquet" - config_name: "ar-MX" data_files: - split: train path: "data/ar-MX/*.parquet" - config_name: "ar-MY" data_files: - split: train path: "data/ar-MY/*.parquet" - config_name: "ar-NA" data_files: - split: train path: "data/ar-NA/*.parquet" - config_name: "ar-NG" data_files: - split: train path: "data/ar-NG/*.parquet" - config_name: "ar-NL" data_files: - split: train path: "data/ar-NL/*.parquet" - config_name: "ar-NO" data_files: - split: train path: "data/ar-NO/*.parquet" - config_name: "ar-NU" data_files: - split: train path: "data/ar-NU/*.parquet" - config_name: "ar-NZ" data_files: - split: train path: "data/ar-NZ/*.parquet" - config_name: "ar-OM" data_files: - split: train path: "data/ar-OM/*.parquet" - config_name: "ar-PE" data_files: - split: train path: "data/ar-PE/*.parquet" - config_name: "ar-PK" data_files: - split: train path: "data/ar-PK/*.parquet" - config_name: "ar-PL" data_files: - split: train path: "data/ar-PL/*.parquet" - config_name: "ar-PS" data_files: - split: 
train path: "data/ar-PS/*.parquet" - config_name: "ar-PT" data_files: - split: train path: "data/ar-PT/*.parquet" - config_name: "ar-PW" data_files: - split: train path: "data/ar-PW/*.parquet" - config_name: "ar-QA" data_files: - split: train path: "data/ar-QA/*.parquet" - config_name: "ar-RO" data_files: - split: train path: "data/ar-RO/*.parquet" - config_name: "ar-RS" data_files: - split: train path: "data/ar-RS/*.parquet" - config_name: "ar-RU" data_files: - split: train path: "data/ar-RU/*.parquet" - config_name: "ar-SA" data_files: - split: train path: "data/ar-SA/*.parquet" - config_name: "ar-SC" data_files: - split: train path: "data/ar-SC/*.parquet" - config_name: "ar-SD" data_files: - split: train path: "data/ar-SD/*.parquet" - config_name: "ar-SE" data_files: - split: train path: "data/ar-SE/*.parquet" - config_name: "ar-SG" data_files: - split: train path: "data/ar-SG/*.parquet" - config_name: "ar-SH" data_files: - split: train path: "data/ar-SH/*.parquet" - config_name: "ar-SI" data_files: - split: train path: "data/ar-SI/*.parquet" - config_name: "ar-SK" data_files: - split: train path: "data/ar-SK/*.parquet" - config_name: "ar-SN" data_files: - split: train path: "data/ar-SN/*.parquet" - config_name: "ar-SO" data_files: - split: train path: "data/ar-SO/*.parquet" - config_name: "ar-ST" data_files: - split: train path: "data/ar-ST/*.parquet" - config_name: "ar-SX" data_files: - split: train path: "data/ar-SX/*.parquet" - config_name: "ar-SY" data_files: - split: train path: "data/ar-SY/*.parquet" - config_name: "ar-TC" data_files: - split: train path: "data/ar-TC/*.parquet" - config_name: "ar-TD" data_files: - split: train path: "data/ar-TD/*.parquet" - config_name: "ar-TH" data_files: - split: train path: "data/ar-TH/*.parquet" - config_name: "ar-TK" data_files: - split: train path: "data/ar-TK/*.parquet" - config_name: "ar-TL" data_files: - split: train path: "data/ar-TL/*.parquet" - config_name: "ar-TM" data_files: - split: train path: 
"data/ar-TM/*.parquet" - config_name: "ar-TN" data_files: - split: train path: "data/ar-TN/*.parquet" - config_name: "ar-TO" data_files: - split: train path: "data/ar-TO/*.parquet" - config_name: "ar-TR" data_files: - split: train path: "data/ar-TR/*.parquet" - config_name: "ar-TT" data_files: - split: train path: "data/ar-TT/*.parquet" - config_name: "ar-TV" data_files: - split: train path: "data/ar-TV/*.parquet" - config_name: "ar-TW" data_files: - split: train path: "data/ar-TW/*.parquet" - config_name: "ar-UA" data_files: - split: train path: "data/ar-UA/*.parquet" - config_name: "ar-US" data_files: - split: train path: "data/ar-US/*.parquet" - config_name: "ar-UZ" data_files: - split: train path: "data/ar-UZ/*.parquet" - config_name: "ar-VA" data_files: - split: train path: "data/ar-VA/*.parquet" - config_name: "ar-VN" data_files: - split: train path: "data/ar-VN/*.parquet" - config_name: "ar-VU" data_files: - split: train path: "data/ar-VU/*.parquet" - config_name: "ar-WS" data_files: - split: train path: "data/ar-WS/*.parquet" - config_name: "ar-WW" data_files: - split: train path: "data/ar-WW/*.parquet" - config_name: "ar-XA" data_files: - split: train path: "data/ar-XA/*.parquet" - config_name: "ar-XM" data_files: - split: train path: "data/ar-XM/*.parquet" - config_name: "ar-XX" data_files: - split: train path: "data/ar-XX/*.parquet" - config_name: "ar-YE" data_files: - split: train path: "data/ar-YE/*.parquet" - config_name: "ar-ZA" data_files: - split: train path: "data/ar-ZA/*.parquet" - config_name: "arz-AR" data_files: - split: train path: "data/arz-AR/*.parquet" - config_name: "arz-EG" data_files: - split: train path: "data/arz-EG/*.parquet" - config_name: "arz-RU" data_files: - split: train path: "data/arz-RU/*.parquet" - config_name: "arz-ST" data_files: - split: train path: "data/arz-ST/*.parquet" - config_name: "arz-US" data_files: - split: train path: "data/arz-US/*.parquet" - config_name: "arz-XX" data_files: - split: train path: 
"data/arz-XX/*.parquet" - config_name: "as-US" data_files: - split: train path: "data/as-US/*.parquet" - config_name: "as-XX" data_files: - split: train path: "data/as-XX/*.parquet" - config_name: "ast-XX" data_files: - split: train path: "data/ast-XX/*.parquet" - config_name: "av-XX" data_files: - split: train path: "data/av-XX/*.parquet" - config_name: "az-AL" data_files: - split: train path: "data/az-AL/*.parquet" - config_name: "az-AR" data_files: - split: train path: "data/az-AR/*.parquet" - config_name: "az-AT" data_files: - split: train path: "data/az-AT/*.parquet" - config_name: "az-AU" data_files: - split: train path: "data/az-AU/*.parquet" - config_name: "az-AZ" data_files: - split: train path: "data/az-AZ/*.parquet" - config_name: "az-BA" data_files: - split: train path: "data/az-BA/*.parquet" - config_name: "az-BR" data_files: - split: train path: "data/az-BR/*.parquet" - config_name: "az-CA" data_files: - split: train path: "data/az-CA/*.parquet" - config_name: "az-CH" data_files: - split: train path: "data/az-CH/*.parquet" - config_name: "az-CL" data_files: - split: train path: "data/az-CL/*.parquet" - config_name: "az-CO" data_files: - split: train path: "data/az-CO/*.parquet" - config_name: "az-CZ" data_files: - split: train path: "data/az-CZ/*.parquet" - config_name: "az-DE" data_files: - split: train path: "data/az-DE/*.parquet" - config_name: "az-DO" data_files: - split: train path: "data/az-DO/*.parquet" - config_name: "az-EG" data_files: - split: train path: "data/az-EG/*.parquet" - config_name: "az-ES" data_files: - split: train path: "data/az-ES/*.parquet" - config_name: "az-EU" data_files: - split: train path: "data/az-EU/*.parquet" - config_name: "az-FM" data_files: - split: train path: "data/az-FM/*.parquet" - config_name: "az-FR" data_files: - split: train path: "data/az-FR/*.parquet" - config_name: "az-GB" data_files: - split: train path: "data/az-GB/*.parquet" - config_name: "az-GE" data_files: - split: train path: 
"data/az-GE/*.parquet" - config_name: "az-GR" data_files: - split: train path: "data/az-GR/*.parquet" - config_name: "az-HR" data_files: - split: train path: "data/az-HR/*.parquet" - config_name: "az-HU" data_files: - split: train path: "data/az-HU/*.parquet" - config_name: "az-ID" data_files: - split: train path: "data/az-ID/*.parquet" - config_name: "az-IL" data_files: - split: train path: "data/az-IL/*.parquet" - config_name: "az-IN" data_files: - split: train path: "data/az-IN/*.parquet" - config_name: "az-IO" data_files: - split: train path: "data/az-IO/*.parquet" - config_name: "az-IR" data_files: - split: train path: "data/az-IR/*.parquet" - config_name: "az-IS" data_files: - split: train path: "data/az-IS/*.parquet" - config_name: "az-IT" data_files: - split: train path: "data/az-IT/*.parquet" - config_name: "az-JP" data_files: - split: train path: "data/az-JP/*.parquet" - config_name: "az-KR" data_files: - split: train path: "data/az-KR/*.parquet" - config_name: "az-KZ" data_files: - split: train path: "data/az-KZ/*.parquet" - config_name: "az-ME" data_files: - split: train path: "data/az-ME/*.parquet" - config_name: "az-MX" data_files: - split: train path: "data/az-MX/*.parquet" - config_name: "az-NL" data_files: - split: train path: "data/az-NL/*.parquet" - config_name: "az-PE" data_files: - split: train path: "data/az-PE/*.parquet" - config_name: "az-PL" data_files: - split: train path: "data/az-PL/*.parquet" - config_name: "az-PM" data_files: - split: train path: "data/az-PM/*.parquet" - config_name: "az-PT" data_files: - split: train path: "data/az-PT/*.parquet" - config_name: "az-PY" data_files: - split: train path: "data/az-PY/*.parquet" - config_name: "az-RO" data_files: - split: train path: "data/az-RO/*.parquet" - config_name: "az-RU" data_files: - split: train path: "data/az-RU/*.parquet" - config_name: "az-SA" data_files: - split: train path: "data/az-SA/*.parquet" - config_name: "az-SE" data_files: - split: train path: "data/az-SE/*.parquet" - 
config_name: "az-SI" data_files: - split: train path: "data/az-SI/*.parquet" - config_name: "az-SK" data_files: - split: train path: "data/az-SK/*.parquet" - config_name: "az-TN" data_files: - split: train path: "data/az-TN/*.parquet" - config_name: "az-TO" data_files: - split: train path: "data/az-TO/*.parquet" - config_name: "az-TR" data_files: - split: train path: "data/az-TR/*.parquet" - config_name: "az-TV" data_files: - split: train path: "data/az-TV/*.parquet" - config_name: "az-UA" data_files: - split: train path: "data/az-UA/*.parquet" - config_name: "az-US" data_files: - split: train path: "data/az-US/*.parquet" - config_name: "az-UY" data_files: - split: train path: "data/az-UY/*.parquet" - config_name: "az-VN" data_files: - split: train path: "data/az-VN/*.parquet" - config_name: "az-WS" data_files: - split: train path: "data/az-WS/*.parquet" - config_name: "az-XX" data_files: - split: train path: "data/az-XX/*.parquet" - config_name: "azb-AZ" data_files: - split: train path: "data/azb-AZ/*.parquet" - config_name: "azb-IR" data_files: - split: train path: "data/azb-IR/*.parquet" - config_name: "azb-TR" data_files: - split: train path: "data/azb-TR/*.parquet" - config_name: "azb-US" data_files: - split: train path: "data/azb-US/*.parquet" - config_name: "azb-XX" data_files: - split: train path: "data/azb-XX/*.parquet" - config_name: "ba-BA" data_files: - split: train path: "data/ba-BA/*.parquet" - config_name: "ba-EU" data_files: - split: train path: "data/ba-EU/*.parquet" - config_name: "ba-RU" data_files: - split: train path: "data/ba-RU/*.parquet" - config_name: "ba-TV" data_files: - split: train path: "data/ba-TV/*.parquet" - config_name: "ba-XX" data_files: - split: train path: "data/ba-XX/*.parquet" - config_name: "be-BE" data_files: - split: train path: "data/be-BE/*.parquet" - config_name: "be-BY" data_files: - split: train path: "data/be-BY/*.parquet" - config_name: "be-CC" data_files: - split: train path: "data/be-CC/*.parquet" - config_name: 
"be-CN" data_files: - split: train path: "data/be-CN/*.parquet" - config_name: "be-CO" data_files: - split: train path: "data/be-CO/*.parquet" - config_name: "be-DE" data_files: - split: train path: "data/be-DE/*.parquet" - config_name: "be-EU" data_files: - split: train path: "data/be-EU/*.parquet" - config_name: "be-FM" data_files: - split: train path: "data/be-FM/*.parquet" - config_name: "be-FR" data_files: - split: train path: "data/be-FR/*.parquet" - config_name: "be-GB" data_files: - split: train path: "data/be-GB/*.parquet" - config_name: "be-GG" data_files: - split: train path: "data/be-GG/*.parquet" - config_name: "be-IO" data_files: - split: train path: "data/be-IO/*.parquet" - config_name: "be-IT" data_files: - split: train path: "data/be-IT/*.parquet" - config_name: "be-JP" data_files: - split: train path: "data/be-JP/*.parquet" - config_name: "be-KZ" data_files: - split: train path: "data/be-KZ/*.parquet" - config_name: "be-LT" data_files: - split: train path: "data/be-LT/*.parquet" - config_name: "be-ME" data_files: - split: train path: "data/be-ME/*.parquet" - config_name: "be-MN" data_files: - split: train path: "data/be-MN/*.parquet" - config_name: "be-PL" data_files: - split: train path: "data/be-PL/*.parquet" - config_name: "be-RU" data_files: - split: train path: "data/be-RU/*.parquet" - config_name: "be-UA" data_files: - split: train path: "data/be-UA/*.parquet" - config_name: "be-US" data_files: - split: train path: "data/be-US/*.parquet" - config_name: "be-VA" data_files: - split: train path: "data/be-VA/*.parquet" - config_name: "be-WS" data_files: - split: train path: "data/be-WS/*.parquet" - config_name: "be-XX" data_files: - split: train path: "data/be-XX/*.parquet" - config_name: "bg-AE" data_files: - split: train path: "data/bg-AE/*.parquet" - config_name: "bg-AM" data_files: - split: train path: "data/bg-AM/*.parquet" - config_name: "bg-AT" data_files: - split: train path: "data/bg-AT/*.parquet" - config_name: "bg-AU" data_files: - 
split: train path: "data/bg-AU/*.parquet" - config_name: "bg-BA" data_files: - split: train path: "data/bg-BA/*.parquet" - config_name: "bg-BE" data_files: - split: train path: "data/bg-BE/*.parquet" - config_name: "bg-BG" data_files: - split: train path: "data/bg-BG/*.parquet" - config_name: "bg-BQ" data_files: - split: train path: "data/bg-BQ/*.parquet" - config_name: "bg-BR" data_files: - split: train path: "data/bg-BR/*.parquet" - config_name: "bg-BU" data_files: - split: train path: "data/bg-BU/*.parquet" - config_name: "bg-BZ" data_files: - split: train path: "data/bg-BZ/*.parquet" - config_name: "bg-CC" data_files: - split: train path: "data/bg-CC/*.parquet" - config_name: "bg-CN" data_files: - split: train path: "data/bg-CN/*.parquet" - config_name: "bg-CO" data_files: - split: train path: "data/bg-CO/*.parquet" - config_name: "bg-CZ" data_files: - split: train path: "data/bg-CZ/*.parquet" - config_name: "bg-DE" data_files: - split: train path: "data/bg-DE/*.parquet" - config_name: "bg-DK" data_files: - split: train path: "data/bg-DK/*.parquet" - config_name: "bg-DO" data_files: - split: train path: "data/bg-DO/*.parquet" - config_name: "bg-EN" data_files: - split: train path: "data/bg-EN/*.parquet" - config_name: "bg-ES" data_files: - split: train path: "data/bg-ES/*.parquet" - config_name: "bg-EU" data_files: - split: train path: "data/bg-EU/*.parquet" - config_name: "bg-FM" data_files: - split: train path: "data/bg-FM/*.parquet" - config_name: "bg-FR" data_files: - split: train path: "data/bg-FR/*.parquet" - config_name: "bg-GB" data_files: - split: train path: "data/bg-GB/*.parquet" - config_name: "bg-GR" data_files: - split: train path: "data/bg-GR/*.parquet" - config_name: "bg-HU" data_files: - split: train path: "data/bg-HU/*.parquet" - config_name: "bg-IN" data_files: - split: train path: "data/bg-IN/*.parquet" - config_name: "bg-IO" data_files: - split: train path: "data/bg-IO/*.parquet" - config_name: "bg-IT" data_files: - split: train path: 
"data/bg-IT/*.parquet" - config_name: "bg-JP" data_files: - split: train path: "data/bg-JP/*.parquet" - config_name: "bg-LI" data_files: - split: train path: "data/bg-LI/*.parquet" - config_name: "bg-ME" data_files: - split: train path: "data/bg-ME/*.parquet" - config_name: "bg-MK" data_files: - split: train path: "data/bg-MK/*.parquet" - config_name: "bg-ML" data_files: - split: train path: "data/bg-ML/*.parquet" - config_name: "bg-NL" data_files: - split: train path: "data/bg-NL/*.parquet" - config_name: "bg-NZ" data_files: - split: train path: "data/bg-NZ/*.parquet" - config_name: "bg-PL" data_files: - split: train path: "data/bg-PL/*.parquet" - config_name: "bg-PT" data_files: - split: train path: "data/bg-PT/*.parquet" - config_name: "bg-PW" data_files: - split: train path: "data/bg-PW/*.parquet" - config_name: "bg-RO" data_files: - split: train path: "data/bg-RO/*.parquet" - config_name: "bg-RS" data_files: - split: train path: "data/bg-RS/*.parquet" - config_name: "bg-RU" data_files: - split: train path: "data/bg-RU/*.parquet" - config_name: "bg-SE" data_files: - split: train path: "data/bg-SE/*.parquet" - config_name: "bg-SI" data_files: - split: train path: "data/bg-SI/*.parquet" - config_name: "bg-SK" data_files: - split: train path: "data/bg-SK/*.parquet" - config_name: "bg-TK" data_files: - split: train path: "data/bg-TK/*.parquet" - config_name: "bg-TO" data_files: - split: train path: "data/bg-TO/*.parquet" - config_name: "bg-TR" data_files: - split: train path: "data/bg-TR/*.parquet" - config_name: "bg-TV" data_files: - split: train path: "data/bg-TV/*.parquet" - config_name: "bg-UA" data_files: - split: train path: "data/bg-UA/*.parquet" - config_name: "bg-US" data_files: - split: train path: "data/bg-US/*.parquet" - config_name: "bg-VA" data_files: - split: train path: "data/bg-VA/*.parquet" - config_name: "bg-WS" data_files: - split: train path: "data/bg-WS/*.parquet" - config_name: "bg-XX" data_files: - split: train path: "data/bg-XX/*.parquet" - 
config_name: "bn-BD" data_files: - split: train path: "data/bn-BD/*.parquet" - config_name: "bn-BR" data_files: - split: train path: "data/bn-BR/*.parquet" - config_name: "bn-CN" data_files: - split: train path: "data/bn-CN/*.parquet" - config_name: "bn-CO" data_files: - split: train path: "data/bn-CO/*.parquet" - config_name: "bn-DE" data_files: - split: train path: "data/bn-DE/*.parquet" - config_name: "bn-EU" data_files: - split: train path: "data/bn-EU/*.parquet" - config_name: "bn-GB" data_files: - split: train path: "data/bn-GB/*.parquet" - config_name: "bn-IN" data_files: - split: train path: "data/bn-IN/*.parquet" - config_name: "bn-TW" data_files: - split: train path: "data/bn-TW/*.parquet" - config_name: "bn-US" data_files: - split: train path: "data/bn-US/*.parquet" - config_name: "bn-XX" data_files: - split: train path: "data/bn-XX/*.parquet" - config_name: "bo-BT" data_files: - split: train path: "data/bo-BT/*.parquet" - config_name: "bo-GB" data_files: - split: train path: "data/bo-GB/*.parquet" - config_name: "bo-US" data_files: - split: train path: "data/bo-US/*.parquet" - config_name: "bo-XX" data_files: - split: train path: "data/bo-XX/*.parquet" - config_name: "br-BR" data_files: - split: train path: "data/br-BR/*.parquet" - config_name: "br-BZ" data_files: - split: train path: "data/br-BZ/*.parquet" - config_name: "br-FR" data_files: - split: train path: "data/br-FR/*.parquet" - config_name: "br-GB" data_files: - split: train path: "data/br-GB/*.parquet" - config_name: "br-XX" data_files: - split: train path: "data/br-XX/*.parquet" - config_name: "bs-IT" data_files: - split: train path: "data/bs-IT/*.parquet" - config_name: "ca-AD" data_files: - split: train path: "data/ca-AD/*.parquet" - config_name: "ca-AE" data_files: - split: train path: "data/ca-AE/*.parquet" - config_name: "ca-AR" data_files: - split: train path: "data/ca-AR/*.parquet" - config_name: "ca-AU" data_files: - split: train path: "data/ca-AU/*.parquet" - config_name: "ca-CA" 
data_files: - split: train path: "data/ca-CA/*.parquet" - config_name: "ca-CC" data_files: - split: train path: "data/ca-CC/*.parquet" - config_name: "ca-CH" data_files: - split: train path: "data/ca-CH/*.parquet" - config_name: "ca-CN" data_files: - split: train path: "data/ca-CN/*.parquet" - config_name: "ca-CO" data_files: - split: train path: "data/ca-CO/*.parquet" - config_name: "ca-CR" data_files: - split: train path: "data/ca-CR/*.parquet" - config_name: "ca-CZ" data_files: - split: train path: "data/ca-CZ/*.parquet" - config_name: "ca-DE" data_files: - split: train path: "data/ca-DE/*.parquet" - config_name: "ca-EN" data_files: - split: train path: "data/ca-EN/*.parquet" - config_name: "ca-ES" data_files: - split: train path: "data/ca-ES/*.parquet" - config_name: "ca-EU" data_files: - split: train path: "data/ca-EU/*.parquet" - config_name: "ca-FM" data_files: - split: train path: "data/ca-FM/*.parquet" - config_name: "ca-FR" data_files: - split: train path: "data/ca-FR/*.parquet" - config_name: "ca-GB" data_files: - split: train path: "data/ca-GB/*.parquet" - config_name: "ca-IO" data_files: - split: train path: "data/ca-IO/*.parquet" - config_name: "ca-IT" data_files: - split: train path: "data/ca-IT/*.parquet" - config_name: "ca-JP" data_files: - split: train path: "data/ca-JP/*.parquet" - config_name: "ca-KR" data_files: - split: train path: "data/ca-KR/*.parquet" - config_name: "ca-LA" data_files: - split: train path: "data/ca-LA/*.parquet" - config_name: "ca-LY" data_files: - split: train path: "data/ca-LY/*.parquet" - config_name: "ca-ME" data_files: - split: train path: "data/ca-ME/*.parquet" - config_name: "ca-MV" data_files: - split: train path: "data/ca-MV/*.parquet" - config_name: "ca-PT" data_files: - split: train path: "data/ca-PT/*.parquet" - config_name: "ca-RU" data_files: - split: train path: "data/ca-RU/*.parquet" - config_name: "ca-SE" data_files: - split: train path: "data/ca-SE/*.parquet" - config_name: "ca-SP" data_files: - split: 
train path: "data/ca-SP/*.parquet" - config_name: "ca-TV" data_files: - split: train path: "data/ca-TV/*.parquet" - config_name: "ca-US" data_files: - split: train path: "data/ca-US/*.parquet" - config_name: "ca-VA" data_files: - split: train path: "data/ca-VA/*.parquet" - config_name: "ca-VL" data_files: - split: train path: "data/ca-VL/*.parquet" - config_name: "ca-WS" data_files: - split: train path: "data/ca-WS/*.parquet" - config_name: "ca-XX" data_files: - split: train path: "data/ca-XX/*.parquet" - config_name: "ce-RU" data_files: - split: train path: "data/ce-RU/*.parquet" - config_name: "ce-XX" data_files: - split: train path: "data/ce-XX/*.parquet" - config_name: "ceb-GB" data_files: - split: train path: "data/ceb-GB/*.parquet" - config_name: "ceb-ID" data_files: - split: train path: "data/ceb-ID/*.parquet" - config_name: "ceb-IS" data_files: - split: train path: "data/ceb-IS/*.parquet" - config_name: "ceb-PH" data_files: - split: train path: "data/ceb-PH/*.parquet" - config_name: "ceb-US" data_files: - split: train path: "data/ceb-US/*.parquet" - config_name: "ceb-XX" data_files: - split: train path: "data/ceb-XX/*.parquet" - config_name: "ckb-AA" data_files: - split: train path: "data/ckb-AA/*.parquet" - config_name: "ckb-CO" data_files: - split: train path: "data/ckb-CO/*.parquet" - config_name: "ckb-DE" data_files: - split: train path: "data/ckb-DE/*.parquet" - config_name: "ckb-DK" data_files: - split: train path: "data/ckb-DK/*.parquet" - config_name: "ckb-GB" data_files: - split: train path: "data/ckb-GB/*.parquet" - config_name: "ckb-GP" data_files: - split: train path: "data/ckb-GP/*.parquet" - config_name: "ckb-IO" data_files: - split: train path: "data/ckb-IO/*.parquet" - config_name: "ckb-IQ" data_files: - split: train path: "data/ckb-IQ/*.parquet" - config_name: "ckb-IR" data_files: - split: train path: "data/ckb-IR/*.parquet" - config_name: "ckb-SE" data_files: - split: train path: "data/ckb-SE/*.parquet" - config_name: "ckb-SH" data_files: 
- split: train path: "data/ckb-SH/*.parquet" - config_name: "ckb-SK" data_files: - split: train path: "data/ckb-SK/*.parquet" - config_name: "ckb-SO" data_files: - split: train path: "data/ckb-SO/*.parquet" - config_name: "ckb-TR" data_files: - split: train path: "data/ckb-TR/*.parquet" - config_name: "ckb-TV" data_files: - split: train path: "data/ckb-TV/*.parquet" - config_name: "ckb-US" data_files: - split: train path: "data/ckb-US/*.parquet" - config_name: "ckb-XX" data_files: - split: train path: "data/ckb-XX/*.parquet" - config_name: "cs-AE" data_files: - split: train path: "data/cs-AE/*.parquet" - config_name: "cs-AG" data_files: - split: train path: "data/cs-AG/*.parquet" - config_name: "cs-AI" data_files: - split: train path: "data/cs-AI/*.parquet" - config_name: "cs-AM" data_files: - split: train path: "data/cs-AM/*.parquet" - config_name: "cs-AR" data_files: - split: train path: "data/cs-AR/*.parquet" - config_name: "cs-AS" data_files: - split: train path: "data/cs-AS/*.parquet" - config_name: "cs-AT" data_files: - split: train path: "data/cs-AT/*.parquet" - config_name: "cs-AU" data_files: - split: train path: "data/cs-AU/*.parquet" - config_name: "cs-AZ" data_files: - split: train path: "data/cs-AZ/*.parquet" - config_name: "cs-BE" data_files: - split: train path: "data/cs-BE/*.parquet" - config_name: "cs-BG" data_files: - split: train path: "data/cs-BG/*.parquet" - config_name: "cs-BR" data_files: - split: train path: "data/cs-BR/*.parquet" - config_name: "cs-BZ" data_files: - split: train path: "data/cs-BZ/*.parquet" - config_name: "cs-CA" data_files: - split: train path: "data/cs-CA/*.parquet" - config_name: "cs-CC" data_files: - split: train path: "data/cs-CC/*.parquet" - config_name: "cs-CD" data_files: - split: train path: "data/cs-CD/*.parquet" - config_name: "cs-CH" data_files: - split: train path: "data/cs-CH/*.parquet" - config_name: "cs-CL" data_files: - split: train path: "data/cs-CL/*.parquet" - config_name: "cs-CN" data_files: - split: 
train path: "data/cs-CN/*.parquet" - config_name: "cs-CO" data_files: - split: train path: "data/cs-CO/*.parquet" - config_name: "cs-CS" data_files: - split: train path: "data/cs-CS/*.parquet" - config_name: "cs-CX" data_files: - split: train path: "data/cs-CX/*.parquet" - config_name: "cs-CY" data_files: - split: train path: "data/cs-CY/*.parquet" - config_name: "cs-CZ" data_files: - split: train path: "data/cs-CZ/*.parquet" - config_name: "cs-DE" data_files: - split: train path: "data/cs-DE/*.parquet" - config_name: "cs-DK" data_files: - split: train path: "data/cs-DK/*.parquet" - config_name: "cs-DO" data_files: - split: train path: "data/cs-DO/*.parquet" - config_name: "cs-EE" data_files: - split: train path: "data/cs-EE/*.parquet" - config_name: "cs-EN" data_files: - split: train path: "data/cs-EN/*.parquet" - config_name: "cs-ES" data_files: - split: train path: "data/cs-ES/*.parquet" - config_name: "cs-EU" data_files: - split: train path: "data/cs-EU/*.parquet" - config_name: "cs-FI" data_files: - split: train path: "data/cs-FI/*.parquet" - config_name: "cs-FM" data_files: - split: train path: "data/cs-FM/*.parquet" - config_name: "cs-FR" data_files: - split: train path: "data/cs-FR/*.parquet" - config_name: "cs-GB" data_files: - split: train path: "data/cs-GB/*.parquet" - config_name: "cs-GG" data_files: - split: train path: "data/cs-GG/*.parquet" - config_name: "cs-GL" data_files: - split: train path: "data/cs-GL/*.parquet" - config_name: "cs-GQ" data_files: - split: train path: "data/cs-GQ/*.parquet" - config_name: "cs-GR" data_files: - split: train path: "data/cs-GR/*.parquet" - config_name: "cs-HK" data_files: - split: train path: "data/cs-HK/*.parquet" - config_name: "cs-HR" data_files: - split: train path: "data/cs-HR/*.parquet" - config_name: "cs-HU" data_files: - split: train path: "data/cs-HU/*.parquet" - config_name: "cs-ID" data_files: - split: train path: "data/cs-ID/*.parquet" - config_name: "cs-IE" data_files: - split: train path: 
"data/cs-IE/*.parquet" - config_name: "cs-IL" data_files: - split: train path: "data/cs-IL/*.parquet" - config_name: "cs-IN" data_files: - split: train path: "data/cs-IN/*.parquet" - config_name: "cs-IO" data_files: - split: train path: "data/cs-IO/*.parquet" - config_name: "cs-IR" data_files: - split: train path: "data/cs-IR/*.parquet" - config_name: "cs-IS" data_files: - split: train path: "data/cs-IS/*.parquet" - config_name: "cs-IT" data_files: - split: train path: "data/cs-IT/*.parquet" - config_name: "cs-JP" data_files: - split: train path: "data/cs-JP/*.parquet" - config_name: "cs-KE" data_files: - split: train path: "data/cs-KE/*.parquet" - config_name: "cs-KR" data_files: - split: train path: "data/cs-KR/*.parquet" - config_name: "cs-LA" data_files: - split: train path: "data/cs-LA/*.parquet" - config_name: "cs-LT" data_files: - split: train path: "data/cs-LT/*.parquet" - config_name: "cs-LU" data_files: - split: train path: "data/cs-LU/*.parquet" - config_name: "cs-LY" data_files: - split: train path: "data/cs-LY/*.parquet" - config_name: "cs-ME" data_files: - split: train path: "data/cs-ME/*.parquet" - config_name: "cs-ML" data_files: - split: train path: "data/cs-ML/*.parquet" - config_name: "cs-MX" data_files: - split: train path: "data/cs-MX/*.parquet" - config_name: "cs-NL" data_files: - split: train path: "data/cs-NL/*.parquet" - config_name: "cs-PH" data_files: - split: train path: "data/cs-PH/*.parquet" - config_name: "cs-PL" data_files: - split: train path: "data/cs-PL/*.parquet" - config_name: "cs-PR" data_files: - split: train path: "data/cs-PR/*.parquet" - config_name: "cs-PT" data_files: - split: train path: "data/cs-PT/*.parquet" - config_name: "cs-PW" data_files: - split: train path: "data/cs-PW/*.parquet" - config_name: "cs-RE" data_files: - split: train path: "data/cs-RE/*.parquet" - config_name: "cs-RO" data_files: - split: train path: "data/cs-RO/*.parquet" - config_name: "cs-RS" data_files: - split: train path: "data/cs-RS/*.parquet" - 
config_name: "cs-RU" data_files: - split: train path: "data/cs-RU/*.parquet" - config_name: "cs-SE" data_files: - split: train path: "data/cs-SE/*.parquet" - config_name: "cs-SG" data_files: - split: train path: "data/cs-SG/*.parquet" - config_name: "cs-SI" data_files: - split: train path: "data/cs-SI/*.parquet" - config_name: "cs-SK" data_files: - split: train path: "data/cs-SK/*.parquet" - config_name: "cs-TK" data_files: - split: train path: "data/cs-TK/*.parquet" - config_name: "cs-TL" data_files: - split: train path: "data/cs-TL/*.parquet" - config_name: "cs-TO" data_files: - split: train path: "data/cs-TO/*.parquet" - config_name: "cs-TR" data_files: - split: train path: "data/cs-TR/*.parquet" - config_name: "cs-TV" data_files: - split: train path: "data/cs-TV/*.parquet" - config_name: "cs-TW" data_files: - split: train path: "data/cs-TW/*.parquet" - config_name: "cs-UA" data_files: - split: train path: "data/cs-UA/*.parquet" - config_name: "cs-US" data_files: - split: train path: "data/cs-US/*.parquet" - config_name: "cs-VA" data_files: - split: train path: "data/cs-VA/*.parquet" - config_name: "cs-VN" data_files: - split: train path: "data/cs-VN/*.parquet" - config_name: "cs-WS" data_files: - split: train path: "data/cs-WS/*.parquet" - config_name: "cs-XK" data_files: - split: train path: "data/cs-XK/*.parquet" - config_name: "cs-XX" data_files: - split: train path: "data/cs-XX/*.parquet" - config_name: "cs-ZA" data_files: - split: train path: "data/cs-ZA/*.parquet" - config_name: "cv-EO" data_files: - split: train path: "data/cv-EO/*.parquet" - config_name: "cv-RU" data_files: - split: train path: "data/cv-RU/*.parquet" - config_name: "cv-XX" data_files: - split: train path: "data/cv-XX/*.parquet" - config_name: "cy-AI" data_files: - split: train path: "data/cy-AI/*.parquet" - config_name: "cy-CC" data_files: - split: train path: "data/cy-CC/*.parquet" - config_name: "cy-CO" data_files: - split: train path: "data/cy-CO/*.parquet" - config_name: "cy-CY" 
data_files: - split: train path: "data/cy-CY/*.parquet" - config_name: "cy-DE" data_files: - split: train path: "data/cy-DE/*.parquet" - config_name: "cy-ES" data_files: - split: train path: "data/cy-ES/*.parquet" - config_name: "cy-EU" data_files: - split: train path: "data/cy-EU/*.parquet" - config_name: "cy-FM" data_files: - split: train path: "data/cy-FM/*.parquet" - config_name: "cy-GB" data_files: - split: train path: "data/cy-GB/*.parquet" - config_name: "cy-IE" data_files: - split: train path: "data/cy-IE/*.parquet" - config_name: "cy-IO" data_files: - split: train path: "data/cy-IO/*.parquet" - config_name: "cy-JP" data_files: - split: train path: "data/cy-JP/*.parquet" - config_name: "cy-TV" data_files: - split: train path: "data/cy-TV/*.parquet" - config_name: "cy-TW" data_files: - split: train path: "data/cy-TW/*.parquet" - config_name: "cy-US" data_files: - split: train path: "data/cy-US/*.parquet" - config_name: "cy-XX" data_files: - split: train path: "data/cy-XX/*.parquet" - config_name: "da-AF" data_files: - split: train path: "data/da-AF/*.parquet" - config_name: "da-AG" data_files: - split: train path: "data/da-AG/*.parquet" - config_name: "da-AI" data_files: - split: train path: "data/da-AI/*.parquet" - config_name: "da-AS" data_files: - split: train path: "data/da-AS/*.parquet" - config_name: "da-AT" data_files: - split: train path: "data/da-AT/*.parquet" - config_name: "da-AU" data_files: - split: train path: "data/da-AU/*.parquet" - config_name: "da-BE" data_files: - split: train path: "data/da-BE/*.parquet" - config_name: "da-BG" data_files: - split: train path: "data/da-BG/*.parquet" - config_name: "da-BR" data_files: - split: train path: "data/da-BR/*.parquet" - config_name: "da-BZ" data_files: - split: train path: "data/da-BZ/*.parquet" - config_name: "da-CA" data_files: - split: train path: "data/da-CA/*.parquet" - config_name: "da-CC" data_files: - split: train path: "data/da-CC/*.parquet" - config_name: "da-CH" data_files: - split: 
train path: "data/da-CH/*.parquet" - config_name: "da-CL" data_files: - split: train path: "data/da-CL/*.parquet" - config_name: "da-CN" data_files: - split: train path: "data/da-CN/*.parquet" - config_name: "da-CO" data_files: - split: train path: "data/da-CO/*.parquet" - config_name: "da-CZ" data_files: - split: train path: "data/da-CZ/*.parquet" - config_name: "da-DA" data_files: - split: train path: "data/da-DA/*.parquet" - config_name: "da-DE" data_files: - split: train path: "data/da-DE/*.parquet" - config_name: "da-DK" data_files: - split: train path: "data/da-DK/*.parquet" - config_name: "da-EE" data_files: - split: train path: "data/da-EE/*.parquet" - config_name: "da-EN" data_files: - split: train path: "data/da-EN/*.parquet" - config_name: "da-ES" data_files: - split: train path: "data/da-ES/*.parquet" - config_name: "da-EU" data_files: - split: train path: "data/da-EU/*.parquet" - config_name: "da-FI" data_files: - split: train path: "data/da-FI/*.parquet" - config_name: "da-FM" data_files: - split: train path: "data/da-FM/*.parquet" - config_name: "da-FO" data_files: - split: train path: "data/da-FO/*.parquet" - config_name: "da-FR" data_files: - split: train path: "data/da-FR/*.parquet" - config_name: "da-GB" data_files: - split: train path: "data/da-GB/*.parquet" - config_name: "da-GG" data_files: - split: train path: "data/da-GG/*.parquet" - config_name: "da-GL" data_files: - split: train path: "data/da-GL/*.parquet" - config_name: "da-GR" data_files: - split: train path: "data/da-GR/*.parquet" - config_name: "da-HU" data_files: - split: train path: "data/da-HU/*.parquet" - config_name: "da-IE" data_files: - split: train path: "data/da-IE/*.parquet" - config_name: "da-IL" data_files: - split: train path: "data/da-IL/*.parquet" - config_name: "da-IN" data_files: - split: train path: "data/da-IN/*.parquet" - config_name: "da-IO" data_files: - split: train path: "data/da-IO/*.parquet" - config_name: "da-IS" data_files: - split: train path: 
"data/da-IS/*.parquet" - config_name: "da-IT" data_files: - split: train path: "data/da-IT/*.parquet" - config_name: "da-KZ" data_files: - split: train path: "data/da-KZ/*.parquet" - config_name: "da-LU" data_files: - split: train path: "data/da-LU/*.parquet" - config_name: "da-ME" data_files: - split: train path: "data/da-ME/*.parquet" - config_name: "da-MX" data_files: - split: train path: "data/da-MX/*.parquet" - config_name: "da-NB" data_files: - split: train path: "data/da-NB/*.parquet" - config_name: "da-NE" data_files: - split: train path: "data/da-NE/*.parquet" - config_name: "da-NL" data_files: - split: train path: "data/da-NL/*.parquet" - config_name: "da-NO" data_files: - split: train path: "data/da-NO/*.parquet" - config_name: "da-NU" data_files: - split: train path: "data/da-NU/*.parquet" - config_name: "da-PE" data_files: - split: train path: "data/da-PE/*.parquet" - config_name: "da-PL" data_files: - split: train path: "data/da-PL/*.parquet" - config_name: "da-PS" data_files: - split: train path: "data/da-PS/*.parquet" - config_name: "da-PT" data_files: - split: train path: "data/da-PT/*.parquet" - config_name: "da-PW" data_files: - split: train path: "data/da-PW/*.parquet" - config_name: "da-RO" data_files: - split: train path: "data/da-RO/*.parquet" - config_name: "da-RU" data_files: - split: train path: "data/da-RU/*.parquet" - config_name: "da-SE" data_files: - split: train path: "data/da-SE/*.parquet" - config_name: "da-SI" data_files: - split: train path: "data/da-SI/*.parquet" - config_name: "da-SK" data_files: - split: train path: "data/da-SK/*.parquet" - config_name: "da-SV" data_files: - split: train path: "data/da-SV/*.parquet" - config_name: "da-SZ" data_files: - split: train path: "data/da-SZ/*.parquet" - config_name: "da-TH" data_files: - split: train path: "data/da-TH/*.parquet" - config_name: "da-TR" data_files: - split: train path: "data/da-TR/*.parquet" - config_name: "da-TV" data_files: - split: train path: "data/da-TV/*.parquet" - 
config_name: "da-UA" data_files: - split: train path: "data/da-UA/*.parquet" - config_name: "da-US" data_files: - split: train path: "data/da-US/*.parquet" - config_name: "da-WS" data_files: - split: train path: "data/da-WS/*.parquet" - config_name: "da-XX" data_files: - split: train path: "data/da-XX/*.parquet" - config_name: "da-ZA" data_files: - split: train path: "data/da-ZA/*.parquet" - config_name: "de-AA" data_files: - split: train path: "data/de-AA/*.parquet" - config_name: "de-AD" data_files: - split: train path: "data/de-AD/*.parquet" - config_name: "de-AE" data_files: - split: train path: "data/de-AE/*.parquet" - config_name: "de-AG" data_files: - split: train path: "data/de-AG/*.parquet" - config_name: "de-AI" data_files: - split: train path: "data/de-AI/*.parquet" - config_name: "de-AL" data_files: - split: train path: "data/de-AL/*.parquet" - config_name: "de-AM" data_files: - split: train path: "data/de-AM/*.parquet" - config_name: "de-AR" data_files: - split: train path: "data/de-AR/*.parquet" - config_name: "de-AS" data_files: - split: train path: "data/de-AS/*.parquet" - config_name: "de-AT" data_files: - split: train path: "data/de-AT/*.parquet" - config_name: "de-AU" data_files: - split: train path: "data/de-AU/*.parquet" - config_name: "de-AX" data_files: - split: train path: "data/de-AX/*.parquet" - config_name: "de-AZ" data_files: - split: train path: "data/de-AZ/*.parquet" - config_name: "de-BA" data_files: - split: train path: "data/de-BA/*.parquet" - config_name: "de-BD" data_files: - split: train path: "data/de-BD/*.parquet" - config_name: "de-BE" data_files: - split: train path: "data/de-BE/*.parquet" - config_name: "de-BG" data_files: - split: train path: "data/de-BG/*.parquet" - config_name: "de-BM" data_files: - split: train path: "data/de-BM/*.parquet" - config_name: "de-BR" data_files: - split: train path: "data/de-BR/*.parquet" - config_name: "de-BS" data_files: - split: train path: "data/de-BS/*.parquet" - config_name: "de-BY" 
data_files: - split: train path: "data/de-BY/*.parquet" - config_name: "de-BZ" data_files: - split: train path: "data/de-BZ/*.parquet" - config_name: "de-CA" data_files: - split: train path: "data/de-CA/*.parquet" - config_name: "de-CC" data_files: - split: train path: "data/de-CC/*.parquet" - config_name: "de-CD" data_files: - split: train path: "data/de-CD/*.parquet" - config_name: "de-CF" data_files: - split: train path: "data/de-CF/*.parquet" - config_name: "de-CH" data_files: - split: train path: "data/de-CH/*.parquet" - config_name: "de-CL" data_files: - split: train path: "data/de-CL/*.parquet" - config_name: "de-CM" data_files: - split: train path: "data/de-CM/*.parquet" - config_name: "de-CN" data_files: - split: train path: "data/de-CN/*.parquet" - config_name: "de-CO" data_files: - split: train path: "data/de-CO/*.parquet" - config_name: "de-CR" data_files: - split: train path: "data/de-CR/*.parquet" - config_name: "de-CS" data_files: - split: train path: "data/de-CS/*.parquet" - config_name: "de-CU" data_files: - split: train path: "data/de-CU/*.parquet" - config_name: "de-CX" data_files: - split: train path: "data/de-CX/*.parquet" - config_name: "de-CY" data_files: - split: train path: "data/de-CY/*.parquet" - config_name: "de-CZ" data_files: - split: train path: "data/de-CZ/*.parquet" - config_name: "de-DE" data_files: - split: train path: "data/de-DE/*.parquet" - config_name: "de-DK" data_files: - split: train path: "data/de-DK/*.parquet" - config_name: "de-DO" data_files: - split: train path: "data/de-DO/*.parquet" - config_name: "de-DS" data_files: - split: train path: "data/de-DS/*.parquet" - config_name: "de-DU" data_files: - split: train path: "data/de-DU/*.parquet" - config_name: "de-EC" data_files: - split: train path: "data/de-EC/*.parquet" - config_name: "de-EE" data_files: - split: train path: "data/de-EE/*.parquet" - config_name: "de-EG" data_files: - split: train path: "data/de-EG/*.parquet" - config_name: "de-EN" data_files: - split: 
train path: "data/de-EN/*.parquet" - config_name: "de-ES" data_files: - split: train path: "data/de-ES/*.parquet" - config_name: "de-ET" data_files: - split: train path: "data/de-ET/*.parquet" - config_name: "de-EU" data_files: - split: train path: "data/de-EU/*.parquet" - config_name: "de-FI" data_files: - split: train path: "data/de-FI/*.parquet" - config_name: "de-FM" data_files: - split: train path: "data/de-FM/*.parquet" - config_name: "de-FR" data_files: - split: train path: "data/de-FR/*.parquet" - config_name: "de-FY" data_files: - split: train path: "data/de-FY/*.parquet" - config_name: "de-GA" data_files: - split: train path: "data/de-GA/*.parquet" - config_name: "de-GB" data_files: - split: train path: "data/de-GB/*.parquet" - config_name: "de-GE" data_files: - split: train path: "data/de-GE/*.parquet" - config_name: "de-GG" data_files: - split: train path: "data/de-GG/*.parquet" - config_name: "de-GL" data_files: - split: train path: "data/de-GL/*.parquet" - config_name: "de-GP" data_files: - split: train path: "data/de-GP/*.parquet" - config_name: "de-GQ" data_files: - split: train path: "data/de-GQ/*.parquet" - config_name: "de-GR" data_files: - split: train path: "data/de-GR/*.parquet" - config_name: "de-GT" data_files: - split: train path: "data/de-GT/*.parquet" - config_name: "de-GX" data_files: - split: train path: "data/de-GX/*.parquet" - config_name: "de-HK" data_files: - split: train path: "data/de-HK/*.parquet" - config_name: "de-HN" data_files: - split: train path: "data/de-HN/*.parquet" - config_name: "de-HR" data_files: - split: train path: "data/de-HR/*.parquet" - config_name: "de-HU" data_files: - split: train path: "data/de-HU/*.parquet" - config_name: "de-ID" data_files: - split: train path: "data/de-ID/*.parquet" - config_name: "de-IE" data_files: - split: train path: "data/de-IE/*.parquet" - config_name: "de-IL" data_files: - split: train path: "data/de-IL/*.parquet" - config_name: "de-IM" data_files: - split: train path: 
"data/de-IM/*.parquet" - config_name: "de-IN" data_files: - split: train path: "data/de-IN/*.parquet" - config_name: "de-IO" data_files: - split: train path: "data/de-IO/*.parquet" - config_name: "de-IQ" data_files: - split: train path: "data/de-IQ/*.parquet" - config_name: "de-IR" data_files: - split: train path: "data/de-IR/*.parquet" - config_name: "de-IS" data_files: - split: train path: "data/de-IS/*.parquet" - config_name: "de-IT" data_files: - split: train path: "data/de-IT/*.parquet" - config_name: "de-JP" data_files: - split: train path: "data/de-JP/*.parquet" - config_name: "de-KG" data_files: - split: train path: "data/de-KG/*.parquet" - config_name: "de-KR" data_files: - split: train path: "data/de-KR/*.parquet" - config_name: "de-KZ" data_files: - split: train path: "data/de-KZ/*.parquet" - config_name: "de-LA" data_files: - split: train path: "data/de-LA/*.parquet" - config_name: "de-LB" data_files: - split: train path: "data/de-LB/*.parquet" - config_name: "de-LC" data_files: - split: train path: "data/de-LC/*.parquet" - config_name: "de-LI" data_files: - split: train path: "data/de-LI/*.parquet" - config_name: "de-LK" data_files: - split: train path: "data/de-LK/*.parquet" - config_name: "de-LS" data_files: - split: train path: "data/de-LS/*.parquet" - config_name: "de-LT" data_files: - split: train path: "data/de-LT/*.parquet" - config_name: "de-LU" data_files: - split: train path: "data/de-LU/*.parquet" - config_name: "de-LV" data_files: - split: train path: "data/de-LV/*.parquet" - config_name: "de-LY" data_files: - split: train path: "data/de-LY/*.parquet" - config_name: "de-MA" data_files: - split: train path: "data/de-MA/*.parquet" - config_name: "de-MD" data_files: - split: train path: "data/de-MD/*.parquet" - config_name: "de-ME" data_files: - split: train path: "data/de-ME/*.parquet" - config_name: "de-MG" data_files: - split: train path: "data/de-MG/*.parquet" - config_name: "de-MK" data_files: - split: train path: "data/de-MK/*.parquet" - 
config_name: "de-ML" data_files: - split: train path: "data/de-ML/*.parquet" - config_name: "de-MN" data_files: - split: train path: "data/de-MN/*.parquet" - config_name: "de-MS" data_files: - split: train path: "data/de-MS/*.parquet" - config_name: "de-MT" data_files: - split: train path: "data/de-MT/*.parquet" - config_name: "de-MX" data_files: - split: train path: "data/de-MX/*.parquet" - config_name: "de-MY" data_files: - split: train path: "data/de-MY/*.parquet" - config_name: "de-NA" data_files: - split: train path: "data/de-NA/*.parquet" - config_name: "de-NF" data_files: - split: train path: "data/de-NF/*.parquet" - config_name: "de-NG" data_files: - split: train path: "data/de-NG/*.parquet" - config_name: "de-NL" data_files: - split: train path: "data/de-NL/*.parquet" - config_name: "de-NO" data_files: - split: train path: "data/de-NO/*.parquet" - config_name: "de-NP" data_files: - split: train path: "data/de-NP/*.parquet" - config_name: "de-NU" data_files: - split: train path: "data/de-NU/*.parquet" - config_name: "de-NZ" data_files: - split: train path: "data/de-NZ/*.parquet" - config_name: "de-OS" data_files: - split: train path: "data/de-OS/*.parquet" - config_name: "de-PA" data_files: - split: train path: "data/de-PA/*.parquet" - config_name: "de-PE" data_files: - split: train path: "data/de-PE/*.parquet" - config_name: "de-PH" data_files: - split: train path: "data/de-PH/*.parquet" - config_name: "de-PK" data_files: - split: train path: "data/de-PK/*.parquet" - config_name: "de-PL" data_files: - split: train path: "data/de-PL/*.parquet" - config_name: "de-PM" data_files: - split: train path: "data/de-PM/*.parquet" - config_name: "de-PN" data_files: - split: train path: "data/de-PN/*.parquet" - config_name: "de-PR" data_files: - split: train path: "data/de-PR/*.parquet" - config_name: "de-PT" data_files: - split: train path: "data/de-PT/*.parquet" - config_name: "de-PW" data_files: - split: train path: "data/de-PW/*.parquet" - config_name: "de-QA" 
data_files: - split: train path: "data/de-QA/*.parquet" - config_name: "de-RO" data_files: - split: train path: "data/de-RO/*.parquet" - config_name: "de-RS" data_files: - split: train path: "data/de-RS/*.parquet" - config_name: "de-RT" data_files: - split: train path: "data/de-RT/*.parquet" - config_name: "de-RU" data_files: - split: train path: "data/de-RU/*.parquet" - config_name: "de-SA" data_files: - split: train path: "data/de-SA/*.parquet" - config_name: "de-SB" data_files: - split: train path: "data/de-SB/*.parquet" - config_name: "de-SC" data_files: - split: train path: "data/de-SC/*.parquet" - config_name: "de-SE" data_files: - split: train path: "data/de-SE/*.parquet" - config_name: "de-SG" data_files: - split: train path: "data/de-SG/*.parquet" - config_name: "de-SH" data_files: - split: train path: "data/de-SH/*.parquet" - config_name: "de-SI" data_files: - split: train path: "data/de-SI/*.parquet" - config_name: "de-SK" data_files: - split: train path: "data/de-SK/*.parquet" - config_name: "de-SM" data_files: - split: train path: "data/de-SM/*.parquet" - config_name: "de-SO" data_files: - split: train path: "data/de-SO/*.parquet" - config_name: "de-ST" data_files: - split: train path: "data/de-ST/*.parquet" - config_name: "de-SV" data_files: - split: train path: "data/de-SV/*.parquet" - config_name: "de-SX" data_files: - split: train path: "data/de-SX/*.parquet" - config_name: "de-TB" data_files: - split: train path: "data/de-TB/*.parquet" - config_name: "de-TC" data_files: - split: train path: "data/de-TC/*.parquet" - config_name: "de-TG" data_files: - split: train path: "data/de-TG/*.parquet" - config_name: "de-TH" data_files: - split: train path: "data/de-TH/*.parquet" - config_name: "de-TJ" data_files: - split: train path: "data/de-TJ/*.parquet" - config_name: "de-TK" data_files: - split: train path: "data/de-TK/*.parquet" - config_name: "de-TL" data_files: - split: train path: "data/de-TL/*.parquet" - config_name: "de-TM" data_files: - split: 
train path: "data/de-TM/*.parquet" - config_name: "de-TO" data_files: - split: train path: "data/de-TO/*.parquet" - config_name: "de-TR" data_files: - split: train path: "data/de-TR/*.parquet" - config_name: "de-TV" data_files: - split: train path: "data/de-TV/*.parquet" - config_name: "de-TW" data_files: - split: train path: "data/de-TW/*.parquet" - config_name: "de-UA" data_files: - split: train path: "data/de-UA/*.parquet" - config_name: "de-UK" data_files: - split: train path: "data/de-UK/*.parquet" - config_name: "de-US" data_files: - split: train path: "data/de-US/*.parquet" - config_name: "de-UY" data_files: - split: train path: "data/de-UY/*.parquet" - config_name: "de-UZ" data_files: - split: train path: "data/de-UZ/*.parquet" - config_name: "de-VA" data_files: - split: train path: "data/de-VA/*.parquet" - config_name: "de-VC" data_files: - split: train path: "data/de-VC/*.parquet" - config_name: "de-VE" data_files: - split: train path: "data/de-VE/*.parquet" - config_name: "de-VN" data_files: - split: train path: "data/de-VN/*.parquet" - config_name: "de-VU" data_files: - split: train path: "data/de-VU/*.parquet" - config_name: "de-WS" data_files: - split: train path: "data/de-WS/*.parquet" - config_name: "de-XX" data_files: - split: train path: "data/de-XX/*.parquet" - config_name: "de-ZA" data_files: - split: train path: "data/de-ZA/*.parquet" - config_name: "de-ZW" data_files: - split: train path: "data/de-ZW/*.parquet" - config_name: "de-ZZ" data_files: - split: train path: "data/de-ZZ/*.parquet" - config_name: "el-AI" data_files: - split: train path: "data/el-AI/*.parquet" - config_name: "el-AL" data_files: - split: train path: "data/el-AL/*.parquet" - config_name: "el-AM" data_files: - split: train path: "data/el-AM/*.parquet" - config_name: "el-AR" data_files: - split: train path: "data/el-AR/*.parquet" - config_name: "el-AU" data_files: - split: train path: "data/el-AU/*.parquet" - config_name: "el-BE" data_files: - split: train path: 
"data/el-BE/*.parquet" - config_name: "el-BG" data_files: - split: train path: "data/el-BG/*.parquet" - config_name: "el-BM" data_files: - split: train path: "data/el-BM/*.parquet" - config_name: "el-BO" data_files: - split: train path: "data/el-BO/*.parquet" - config_name: "el-BR" data_files: - split: train path: "data/el-BR/*.parquet" - config_name: "el-BZ" data_files: - split: train path: "data/el-BZ/*.parquet" - config_name: "el-CA" data_files: - split: train path: "data/el-CA/*.parquet" - config_name: "el-CC" data_files: - split: train path: "data/el-CC/*.parquet" - config_name: "el-CF" data_files: - split: train path: "data/el-CF/*.parquet" - config_name: "el-CH" data_files: - split: train path: "data/el-CH/*.parquet" - config_name: "el-CN" data_files: - split: train path: "data/el-CN/*.parquet" - config_name: "el-CO" data_files: - split: train path: "data/el-CO/*.parquet" - config_name: "el-CY" data_files: - split: train path: "data/el-CY/*.parquet" - config_name: "el-CZ" data_files: - split: train path: "data/el-CZ/*.parquet" - config_name: "el-DE" data_files: - split: train path: "data/el-DE/*.parquet" - config_name: "el-DJ" data_files: - split: train path: "data/el-DJ/*.parquet" - config_name: "el-DK" data_files: - split: train path: "data/el-DK/*.parquet" - config_name: "el-EL" data_files: - split: train path: "data/el-EL/*.parquet" - config_name: "el-EN" data_files: - split: train path: "data/el-EN/*.parquet" - config_name: "el-ES" data_files: - split: train path: "data/el-ES/*.parquet" - config_name: "el-EU" data_files: - split: train path: "data/el-EU/*.parquet" - config_name: "el-FI" data_files: - split: train path: "data/el-FI/*.parquet" - config_name: "el-FM" data_files: - split: train path: "data/el-FM/*.parquet" - config_name: "el-FR" data_files: - split: train path: "data/el-FR/*.parquet" - config_name: "el-GA" data_files: - split: train path: "data/el-GA/*.parquet" - config_name: "el-GB" data_files: - split: train path: "data/el-GB/*.parquet" - 
config_name: "el-GQ" data_files: - split: train path: "data/el-GQ/*.parquet" - config_name: "el-GR" data_files: - split: train path: "data/el-GR/*.parquet" - config_name: "el-HR" data_files: - split: train path: "data/el-HR/*.parquet" - config_name: "el-IL" data_files: - split: train path: "data/el-IL/*.parquet" - config_name: "el-IN" data_files: - split: train path: "data/el-IN/*.parquet" - config_name: "el-IO" data_files: - split: train path: "data/el-IO/*.parquet" - config_name: "el-IR" data_files: - split: train path: "data/el-IR/*.parquet" - config_name: "el-IS" data_files: - split: train path: "data/el-IS/*.parquet" - config_name: "el-IT" data_files: - split: train path: "data/el-IT/*.parquet" - config_name: "el-LY" data_files: - split: train path: "data/el-LY/*.parquet" - config_name: "el-ME" data_files: - split: train path: "data/el-ME/*.parquet" - config_name: "el-MK" data_files: - split: train path: "data/el-MK/*.parquet" - config_name: "el-ML" data_files: - split: train path: "data/el-ML/*.parquet" - config_name: "el-MX" data_files: - split: train path: "data/el-MX/*.parquet" - config_name: "el-NL" data_files: - split: train path: "data/el-NL/*.parquet" - config_name: "el-NO" data_files: - split: train path: "data/el-NO/*.parquet" - config_name: "el-NZ" data_files: - split: train path: "data/el-NZ/*.parquet" - config_name: "el-PL" data_files: - split: train path: "data/el-PL/*.parquet" - config_name: "el-PM" data_files: - split: train path: "data/el-PM/*.parquet" - config_name: "el-PO" data_files: - split: train path: "data/el-PO/*.parquet" - config_name: "el-PT" data_files: - split: train path: "data/el-PT/*.parquet" - config_name: "el-RO" data_files: - split: train path: "data/el-RO/*.parquet" - config_name: "el-RS" data_files: - split: train path: "data/el-RS/*.parquet" - config_name: "el-RU" data_files: - split: train path: "data/el-RU/*.parquet" - config_name: "el-SE" data_files: - split: train path: "data/el-SE/*.parquet" - config_name: "el-SG" 
data_files: - split: train path: "data/el-SG/*.parquet" - config_name: "el-SK" data_files: - split: train path: "data/el-SK/*.parquet" - config_name: "el-ST" data_files: - split: train path: "data/el-ST/*.parquet" - config_name: "el-TH" data_files: - split: train path: "data/el-TH/*.parquet" - config_name: "el-TK" data_files: - split: train path: "data/el-TK/*.parquet" - config_name: "el-TL" data_files: - split: train path: "data/el-TL/*.parquet" - config_name: "el-TO" data_files: - split: train path: "data/el-TO/*.parquet" - config_name: "el-TR" data_files: - split: train path: "data/el-TR/*.parquet" - config_name: "el-TV" data_files: - split: train path: "data/el-TV/*.parquet" - config_name: "el-TW" data_files: - split: train path: "data/el-TW/*.parquet" - config_name: "el-UA" data_files: - split: train path: "data/el-UA/*.parquet" - config_name: "el-US" data_files: - split: train path: "data/el-US/*.parquet" - config_name: "el-WS" data_files: - split: train path: "data/el-WS/*.parquet" - config_name: "el-XX" data_files: - split: train path: "data/el-XX/*.parquet" - config_name: "en-AA" data_files: - split: train path: "data/en-AA/*.parquet" - config_name: "en-AC" data_files: - split: train path: "data/en-AC/*.parquet" - config_name: "en-AD" data_files: - split: train path: "data/en-AD/*.parquet" - config_name: "en-AE" data_files: - split: train path: "data/en-AE/*.parquet" - config_name: "en-AF" data_files: - split: train path: "data/en-AF/*.parquet" - config_name: "en-AG" data_files: - split: train path: "data/en-AG/*.parquet" - config_name: "en-AI" data_files: - split: train path: "data/en-AI/*.parquet" - config_name: "en-AL" data_files: - split: train path: "data/en-AL/*.parquet" - config_name: "en-AM" data_files: - split: train path: "data/en-AM/*.parquet" - config_name: "en-AN" data_files: - split: train path: "data/en-AN/*.parquet" - config_name: "en-AO" data_files: - split: train path: "data/en-AO/*.parquet" - config_name: "en-AP" data_files: - split: 
train path: "data/en-AP/*.parquet" - config_name: "en-AQ" data_files: - split: train path: "data/en-AQ/*.parquet" - config_name: "en-AR" data_files: - split: train path: "data/en-AR/*.parquet" - config_name: "en-AS" data_files: - split: train path: "data/en-AS/*.parquet" - config_name: "en-AT" data_files: - split: train path: "data/en-AT/*.parquet" - config_name: "en-AU" data_files: - split: train path: "data/en-AU/*.parquet" - config_name: "en-AW" data_files: - split: train path: "data/en-AW/*.parquet" - config_name: "en-AX" data_files: - split: train path: "data/en-AX/*.parquet" - config_name: "en-AZ" data_files: - split: train path: "data/en-AZ/*.parquet" - config_name: "en-BA" data_files: - split: train path: "data/en-BA/*.parquet" - config_name: "en-BB" data_files: - split: train path: "data/en-BB/*.parquet" - config_name: "en-BD" data_files: - split: train path: "data/en-BD/*.parquet" - config_name: "en-BE" data_files: - split: train path: "data/en-BE/*.parquet" - config_name: "en-BF" data_files: - split: train path: "data/en-BF/*.parquet" - config_name: "en-BG" data_files: - split: train path: "data/en-BG/*.parquet" - config_name: "en-BH" data_files: - split: train path: "data/en-BH/*.parquet" - config_name: "en-BI" data_files: - split: train path: "data/en-BI/*.parquet" - config_name: "en-BJ" data_files: - split: train path: "data/en-BJ/*.parquet" - config_name: "en-BL" data_files: - split: train path: "data/en-BL/*.parquet" - config_name: "en-BM" data_files: - split: train path: "data/en-BM/*.parquet" - config_name: "en-BN" data_files: - split: train path: "data/en-BN/*.parquet" - config_name: "en-BO" data_files: - split: train path: "data/en-BO/*.parquet" - config_name: "en-BQ" data_files: - split: train path: "data/en-BQ/*.parquet" - config_name: "en-BR" data_files: - split: train path: "data/en-BR/*.parquet" - config_name: "en-BS" data_files: - split: train path: "data/en-BS/*.parquet" - config_name: "en-BT" data_files: - split: train path: 
"data/en-BT/*.parquet" - config_name: "en-BW" data_files: - split: train path: "data/en-BW/*.parquet" - config_name: "en-BY" data_files: - split: train path: "data/en-BY/*.parquet" - config_name: "en-BZ" data_files: - split: train path: "data/en-BZ/*.parquet" - config_name: "en-CA" data_files: - split: train path: "data/en-CA/*.parquet" - config_name: "en-CB" data_files: - split: train path: "data/en-CB/*.parquet" - config_name: "en-CC" data_files: - split: train path: "data/en-CC/*.parquet" - config_name: "en-CD" data_files: - split: train path: "data/en-CD/*.parquet" - config_name: "en-CE" data_files: - split: train path: "data/en-CE/*.parquet" - config_name: "en-CF" data_files: - split: train path: "data/en-CF/*.parquet" - config_name: "en-CG" data_files: - split: train path: "data/en-CG/*.parquet" - config_name: "en-CH" data_files: - split: train path: "data/en-CH/*.parquet" - config_name: "en-CI" data_files: - split: train path: "data/en-CI/*.parquet" - config_name: "en-CK" data_files: - split: train path: "data/en-CK/*.parquet" - config_name: "en-CL" data_files: - split: train path: "data/en-CL/*.parquet" - config_name: "en-CM" data_files: - split: train path: "data/en-CM/*.parquet" - config_name: "en-CN" data_files: - split: train path: "data/en-CN/*.parquet" - config_name: "en-CO" data_files: - split: train path: "data/en-CO/*.parquet" - config_name: "en-CR" data_files: - split: train path: "data/en-CR/*.parquet" - config_name: "en-CU" data_files: - split: train path: "data/en-CU/*.parquet" - config_name: "en-CV" data_files: - split: train path: "data/en-CV/*.parquet" - config_name: "en-CW" data_files: - split: train path: "data/en-CW/*.parquet" - config_name: "en-CX" data_files: - split: train path: "data/en-CX/*.parquet" - config_name: "en-CY" data_files: - split: train path: "data/en-CY/*.parquet" - config_name: "en-CZ" data_files: - split: train path: "data/en-CZ/*.parquet" - config_name: "en-DA" data_files: - split: train path: "data/en-DA/*.parquet" - 
config_name: "en-DE" data_files: - split: train path: "data/en-DE/*.parquet" - config_name: "en-DJ" data_files: - split: train path: "data/en-DJ/*.parquet" - config_name: "en-DK" data_files: - split: train path: "data/en-DK/*.parquet" - config_name: "en-DM" data_files: - split: train path: "data/en-DM/*.parquet" - config_name: "en-DO" data_files: - split: train path: "data/en-DO/*.parquet" - config_name: "en-DR" data_files: - split: train path: "data/en-DR/*.parquet" - config_name: "en-DX" data_files: - split: train path: "data/en-DX/*.parquet" - config_name: "en-DZ" data_files: - split: train path: "data/en-DZ/*.parquet" - config_name: "en-EC" data_files: - split: train path: "data/en-EC/*.parquet" - config_name: "en-EE" data_files: - split: train path: "data/en-EE/*.parquet" - config_name: "en-EG" data_files: - split: train path: "data/en-EG/*.parquet" - config_name: "en-EH" data_files: - split: train path: "data/en-EH/*.parquet" - config_name: "en-EI" data_files: - split: train path: "data/en-EI/*.parquet" - config_name: "en-EM" data_files: - split: train path: "data/en-EM/*.parquet" - config_name: "en-EN" data_files: - split: train path: "data/en-EN/*.parquet" - config_name: "en-ES" data_files: - split: train path: "data/en-ES/*.parquet" - config_name: "en-ET" data_files: - split: train path: "data/en-ET/*.parquet" - config_name: "en-EU" data_files: - split: train path: "data/en-EU/*.parquet" - config_name: "en-EX" data_files: - split: train path: "data/en-EX/*.parquet" - config_name: "en-FA" data_files: - split: train path: "data/en-FA/*.parquet" - config_name: "en-FI" data_files: - split: train path: "data/en-FI/*.parquet" - config_name: "en-FJ" data_files: - split: train path: "data/en-FJ/*.parquet" - config_name: "en-FL" data_files: - split: train path: "data/en-FL/*.parquet" - config_name: "en-FM" data_files: - split: train path: "data/en-FM/*.parquet" - config_name: "en-FO" data_files: - split: train path: "data/en-FO/*.parquet" - config_name: "en-FP" 
data_files: - split: train path: "data/en-FP/*.parquet" - config_name: "en-FR" data_files: - split: train path: "data/en-FR/*.parquet" - config_name: "en-GA" data_files: - split: train path: "data/en-GA/*.parquet" - config_name: "en-GB" data_files: - split: train path: "data/en-GB/*.parquet" - config_name: "en-GD" data_files: - split: train path: "data/en-GD/*.parquet" - config_name: "en-GE" data_files: - split: train path: "data/en-GE/*.parquet" - config_name: "en-GF" data_files: - split: train path: "data/en-GF/*.parquet" - config_name: "en-GG" data_files: - split: train path: "data/en-GG/*.parquet" - config_name: "en-GH" data_files: - split: train path: "data/en-GH/*.parquet" - config_name: "en-GI" data_files: - split: train path: "data/en-GI/*.parquet" - config_name: "en-GL" data_files: - split: train path: "data/en-GL/*.parquet" - config_name: "en-GM" data_files: - split: train path: "data/en-GM/*.parquet" - config_name: "en-GN" data_files: - split: train path: "data/en-GN/*.parquet" - config_name: "en-GO" data_files: - split: train path: "data/en-GO/*.parquet" - config_name: "en-GP" data_files: - split: train path: "data/en-GP/*.parquet" - config_name: "en-GQ" data_files: - split: train path: "data/en-GQ/*.parquet" - config_name: "en-GR" data_files: - split: train path: "data/en-GR/*.parquet" - config_name: "en-GS" data_files: - split: train path: "data/en-GS/*.parquet" - config_name: "en-GT" data_files: - split: train path: "data/en-GT/*.parquet" - config_name: "en-GU" data_files: - split: train path: "data/en-GU/*.parquet" - config_name: "en-GW" data_files: - split: train path: "data/en-GW/*.parquet" - config_name: "en-GX" data_files: - split: train path: "data/en-GX/*.parquet" - config_name: "en-GY" data_files: - split: train path: "data/en-GY/*.parquet" - config_name: "en-HE" data_files: - split: train path: "data/en-HE/*.parquet" - config_name: "en-HK" data_files: - split: train path: "data/en-HK/*.parquet" - config_name: "en-HN" data_files: - split: 
train path: "data/en-HN/*.parquet" - config_name: "en-HQ" data_files: - split: train path: "data/en-HQ/*.parquet" - config_name: "en-HR" data_files: - split: train path: "data/en-HR/*.parquet" - config_name: "en-HT" data_files: - split: train path: "data/en-HT/*.parquet" - config_name: "en-HU" data_files: - split: train path: "data/en-HU/*.parquet" - config_name: "en-IC" data_files: - split: train path: "data/en-IC/*.parquet" - config_name: "en-ID" data_files: - split: train path: "data/en-ID/*.parquet" - config_name: "en-IE" data_files: - split: train path: "data/en-IE/*.parquet" - config_name: "en-IL" data_files: - split: train path: "data/en-IL/*.parquet" - config_name: "en-IM" data_files: - split: train path: "data/en-IM/*.parquet" - config_name: "en-IN" data_files: - split: train path: "data/en-IN/*.parquet" - config_name: "en-IO" data_files: - split: train path: "data/en-IO/*.parquet" - config_name: "en-IQ" data_files: - split: train path: "data/en-IQ/*.parquet" - config_name: "en-IR" data_files: - split: train path: "data/en-IR/*.parquet" - config_name: "en-IS" data_files: - split: train path: "data/en-IS/*.parquet" - config_name: "en-IT" data_files: - split: train path: "data/en-IT/*.parquet" - config_name: "en-IX" data_files: - split: train path: "data/en-IX/*.parquet" - config_name: "en-JA" data_files: - split: train path: "data/en-JA/*.parquet" - config_name: "en-JE" data_files: - split: train path: "data/en-JE/*.parquet" - config_name: "en-JG" data_files: - split: train path: "data/en-JG/*.parquet" - config_name: "en-JM" data_files: - split: train path: "data/en-JM/*.parquet" - config_name: "en-JO" data_files: - split: train path: "data/en-JO/*.parquet" - config_name: "en-JP" data_files: - split: train path: "data/en-JP/*.parquet" - config_name: "en-KE" data_files: - split: train path: "data/en-KE/*.parquet" - config_name: "en-KG" data_files: - split: train path: "data/en-KG/*.parquet" - config_name: "en-KH" data_files: - split: train path: 
"data/en-KH/*.parquet" - config_name: "en-KI" data_files: - split: train path: "data/en-KI/*.parquet" - config_name: "en-KM" data_files: - split: train path: "data/en-KM/*.parquet" - config_name: "en-KN" data_files: - split: train path: "data/en-KN/*.parquet" - config_name: "en-KP" data_files: - split: train path: "data/en-KP/*.parquet" - config_name: "en-KR" data_files: - split: train path: "data/en-KR/*.parquet" - config_name: "en-KS" data_files: - split: train path: "data/en-KS/*.parquet" - config_name: "en-KW" data_files: - split: train path: "data/en-KW/*.parquet" - config_name: "en-KY" data_files: - split: train path: "data/en-KY/*.parquet" - config_name: "en-KZ" data_files: - split: train path: "data/en-KZ/*.parquet" - config_name: "en-LA" data_files: - split: train path: "data/en-LA/*.parquet" - config_name: "en-LB" data_files: - split: train path: "data/en-LB/*.parquet" - config_name: "en-LC" data_files: - split: train path: "data/en-LC/*.parquet" - config_name: "en-LI" data_files: - split: train path: "data/en-LI/*.parquet" - config_name: "en-LK" data_files: - split: train path: "data/en-LK/*.parquet" - config_name: "en-LM" data_files: - split: train path: "data/en-LM/*.parquet" - config_name: "en-LO" data_files: - split: train path: "data/en-LO/*.parquet" - config_name: "en-LP" data_files: - split: train path: "data/en-LP/*.parquet" - config_name: "en-LR" data_files: - split: train path: "data/en-LR/*.parquet" - config_name: "en-LS" data_files: - split: train path: "data/en-LS/*.parquet" - config_name: "en-LT" data_files: - split: train path: "data/en-LT/*.parquet" - config_name: "en-LU" data_files: - split: train path: "data/en-LU/*.parquet" - config_name: "en-LV" data_files: - split: train path: "data/en-LV/*.parquet" - config_name: "en-LY" data_files: - split: train path: "data/en-LY/*.parquet" - config_name: "en-MA" data_files: - split: train path: "data/en-MA/*.parquet" - config_name: "en-MC" data_files: - split: train path: "data/en-MC/*.parquet" - 
config_name: "en-MD" data_files: - split: train path: "data/en-MD/*.parquet" - config_name: "en-ME" data_files: - split: train path: "data/en-ME/*.parquet" - config_name: "en-MF" data_files: - split: train path: "data/en-MF/*.parquet" - config_name: "en-MG" data_files: - split: train path: "data/en-MG/*.parquet" - config_name: "en-MH" data_files: - split: train path: "data/en-MH/*.parquet" - config_name: "en-MI" data_files: - split: train path: "data/en-MI/*.parquet" - config_name: "en-MK" data_files: - split: train path: "data/en-MK/*.parquet" - config_name: "en-ML" data_files: - split: train path: "data/en-ML/*.parquet" - config_name: "en-MM" data_files: - split: train path: "data/en-MM/*.parquet" - config_name: "en-MN" data_files: - split: train path: "data/en-MN/*.parquet" - config_name: "en-MO" data_files: - split: train path: "data/en-MO/*.parquet" - config_name: "en-MP" data_files: - split: train path: "data/en-MP/*.parquet" - config_name: "en-MQ" data_files: - split: train path: "data/en-MQ/*.parquet" - config_name: "en-MR" data_files: - split: train path: "data/en-MR/*.parquet" - config_name: "en-MS" data_files: - split: train path: "data/en-MS/*.parquet" - config_name: "en-MT" data_files: - split: train path: "data/en-MT/*.parquet" - config_name: "en-MU" data_files: - split: train path: "data/en-MU/*.parquet" - config_name: "en-MV" data_files: - split: train path: "data/en-MV/*.parquet" - config_name: "en-MW" data_files: - split: train path: "data/en-MW/*.parquet" - config_name: "en-MX" data_files: - split: train path: "data/en-MX/*.parquet" - config_name: "en-MY" data_files: - split: train path: "data/en-MY/*.parquet" - config_name: "en-MZ" data_files: - split: train path: "data/en-MZ/*.parquet" - config_name: "en-NA" data_files: - split: train path: "data/en-NA/*.parquet" - config_name: "en-NB" data_files: - split: train path: "data/en-NB/*.parquet" - config_name: "en-NC" data_files: - split: train path: "data/en-NC/*.parquet" - config_name: "en-ND" 
data_files: - split: train path: "data/en-ND/*.parquet" - config_name: "en-NE" data_files: - split: train path: "data/en-NE/*.parquet" - config_name: "en-NF" data_files: - split: train path: "data/en-NF/*.parquet" - config_name: "en-NG" data_files: - split: train path: "data/en-NG/*.parquet" - config_name: "en-NI" data_files: - split: train path: "data/en-NI/*.parquet" - config_name: "en-NL" data_files: - split: train path: "data/en-NL/*.parquet" - config_name: "en-NN" data_files: - split: train path: "data/en-NN/*.parquet" - config_name: "en-NO" data_files: - split: train path: "data/en-NO/*.parquet" - config_name: "en-NP" data_files: - split: train path: "data/en-NP/*.parquet" - config_name: "en-NR" data_files: - split: train path: "data/en-NR/*.parquet" - config_name: "en-NS" data_files: - split: train path: "data/en-NS/*.parquet" - config_name: "en-NT" data_files: - split: train path: "data/en-NT/*.parquet" - config_name: "en-NU" data_files: - split: train path: "data/en-NU/*.parquet" - config_name: "en-NZ" data_files: - split: train path: "data/en-NZ/*.parquet" - config_name: "en-OC" data_files: - split: train path: "data/en-OC/*.parquet" - config_name: "en-OE" data_files: - split: train path: "data/en-OE/*.parquet" - config_name: "en-OJ" data_files: - split: train path: "data/en-OJ/*.parquet" - config_name: "en-OM" data_files: - split: train path: "data/en-OM/*.parquet" - config_name: "en-OT" data_files: - split: train path: "data/en-OT/*.parquet" - config_name: "en-PA" data_files: - split: train path: "data/en-PA/*.parquet" - config_name: "en-PE" data_files: - split: train path: "data/en-PE/*.parquet" - config_name: "en-PF" data_files: - split: train path: "data/en-PF/*.parquet" - config_name: "en-PG" data_files: - split: train path: "data/en-PG/*.parquet" - config_name: "en-PH" data_files: - split: train path: "data/en-PH/*.parquet" - config_name: "en-PI" data_files: - split: train path: "data/en-PI/*.parquet" - config_name: "en-PK" data_files: - split: 
train path: "data/en-PK/*.parquet" - config_name: "en-PL" data_files: - split: train path: "data/en-PL/*.parquet" - config_name: "en-PM" data_files: - split: train path: "data/en-PM/*.parquet" - config_name: "en-PN" data_files: - split: train path: "data/en-PN/*.parquet" - config_name: "en-PO" data_files: - split: train path: "data/en-PO/*.parquet" - config_name: "en-PR" data_files: - split: train path: "data/en-PR/*.parquet" - config_name: "en-PS" data_files: - split: train path: "data/en-PS/*.parquet" - config_name: "en-PT" data_files: - split: train path: "data/en-PT/*.parquet" - config_name: "en-PW" data_files: - split: train path: "data/en-PW/*.parquet" - config_name: "en-PY" data_files: - split: train path: "data/en-PY/*.parquet" - config_name: "en-QA" data_files: - split: train path: "data/en-QA/*.parquet" - config_name: "en-RE" data_files: - split: train path: "data/en-RE/*.parquet" - config_name: "en-RO" data_files: - split: train path: "data/en-RO/*.parquet" - config_name: "en-RS" data_files: - split: train path: "data/en-RS/*.parquet" - config_name: "en-RU" data_files: - split: train path: "data/en-RU/*.parquet" - config_name: "en-RW" data_files: - split: train path: "data/en-RW/*.parquet" - config_name: "en-SA" data_files: - split: train path: "data/en-SA/*.parquet" - config_name: "en-SB" data_files: - split: train path: "data/en-SB/*.parquet" - config_name: "en-SC" data_files: - split: train path: "data/en-SC/*.parquet" - config_name: "en-SD" data_files: - split: train path: "data/en-SD/*.parquet" - config_name: "en-SE" data_files: - split: train path: "data/en-SE/*.parquet" - config_name: "en-SG" data_files: - split: train path: "data/en-SG/*.parquet" - config_name: "en-SH" data_files: - split: train path: "data/en-SH/*.parquet" - config_name: "en-SI" data_files: - split: train path: "data/en-SI/*.parquet" - config_name: "en-SJ" data_files: - split: train path: "data/en-SJ/*.parquet" - config_name: "en-SK" data_files: - split: train path: 
"data/en-SK/*.parquet" - config_name: "en-SL" data_files: - split: train path: "data/en-SL/*.parquet" - config_name: "en-SM" data_files: - split: train path: "data/en-SM/*.parquet" - config_name: "en-SN" data_files: - split: train path: "data/en-SN/*.parquet" - config_name: "en-SO" data_files: - split: train path: "data/en-SO/*.parquet" - config_name: "en-SP" data_files: - split: train path: "data/en-SP/*.parquet" - config_name: "en-SR" data_files: - split: train path: "data/en-SR/*.parquet" - config_name: "en-SS" data_files: - split: train path: "data/en-SS/*.parquet" - config_name: "en-ST" data_files: - split: train path: "data/en-ST/*.parquet" - config_name: "en-SV" data_files: - split: train path: "data/en-SV/*.parquet" - config_name: "en-SW" data_files: - split: train path: "data/en-SW/*.parquet" - config_name: "en-SX" data_files: - split: train path: "data/en-SX/*.parquet" - config_name: "en-SY" data_files: - split: train path: "data/en-SY/*.parquet" - config_name: "en-SZ" data_files: - split: train path: "data/en-SZ/*.parquet" - config_name: "en-TC" data_files: - split: train path: "data/en-TC/*.parquet" - config_name: "en-TD" data_files: - split: train path: "data/en-TD/*.parquet" - config_name: "en-TF" data_files: - split: train path: "data/en-TF/*.parquet" - config_name: "en-TG" data_files: - split: train path: "data/en-TG/*.parquet" - config_name: "en-TH" data_files: - split: train path: "data/en-TH/*.parquet" - config_name: "en-TJ" data_files: - split: train path: "data/en-TJ/*.parquet" - config_name: "en-TK" data_files: - split: train path: "data/en-TK/*.parquet" - config_name: "en-TL" data_files: - split: train path: "data/en-TL/*.parquet" - config_name: "en-TM" data_files: - split: train path: "data/en-TM/*.parquet" - config_name: "en-TN" data_files: - split: train path: "data/en-TN/*.parquet" - config_name: "en-TO" data_files: - split: train path: "data/en-TO/*.parquet" - config_name: "en-TR" data_files: - split: train path: "data/en-TR/*.parquet" - 
config_name: "en-TT" data_files: - split: train path: "data/en-TT/*.parquet" - config_name: "en-TV" data_files: - split: train path: "data/en-TV/*.parquet" - config_name: "en-TW" data_files: - split: train path: "data/en-TW/*.parquet" - config_name: "en-TZ" data_files: - split: train path: "data/en-TZ/*.parquet" - config_name: "en-UA" data_files: - split: train path: "data/en-UA/*.parquet" - config_name: "en-UG" data_files: - split: train path: "data/en-UG/*.parquet" - config_name: "en-UK" data_files: - split: train path: "data/en-UK/*.parquet" - config_name: "en-UM" data_files: - split: train path: "data/en-UM/*.parquet" - config_name: "en-UN" data_files: - split: train path: "data/en-UN/*.parquet" - config_name: "en-US" data_files: - split: train path: "data/en-US/*.parquet" - config_name: "en-UY" data_files: - split: train path: "data/en-UY/*.parquet" - config_name: "en-UZ" data_files: - split: train path: "data/en-UZ/*.parquet" - config_name: "en-VA" data_files: - split: train path: "data/en-VA/*.parquet" - config_name: "en-VC" data_files: - split: train path: "data/en-VC/*.parquet" - config_name: "en-VE" data_files: - split: train path: "data/en-VE/*.parquet" - config_name: "en-VG" data_files: - split: train path: "data/en-VG/*.parquet" - config_name: "en-VI" data_files: - split: train path: "data/en-VI/*.parquet" - config_name: "en-VN" data_files: - split: train path: "data/en-VN/*.parquet" - config_name: "en-VR" data_files: - split: train path: "data/en-VR/*.parquet" - config_name: "en-VU" data_files: - split: train path: "data/en-VU/*.parquet" - config_name: "en-WA" data_files: - split: train path: "data/en-WA/*.parquet" - config_name: "en-WF" data_files: - split: train path: "data/en-WF/*.parquet" - config_name: "en-WI" data_files: - split: train path: "data/en-WI/*.parquet" - config_name: "en-WP" data_files: - split: train path: "data/en-WP/*.parquet" - config_name: "en-WR" data_files: - split: train path: "data/en-WR/*.parquet" - config_name: "en-WS" 
data_files: - split: train path: "data/en-WS/*.parquet" - config_name: "en-WW" data_files: - split: train path: "data/en-WW/*.parquet" - config_name: "en-XA" data_files: - split: train path: "data/en-XA/*.parquet" - config_name: "en-XB" data_files: - split: train path: "data/en-XB/*.parquet" - config_name: "en-XE" data_files: - split: train path: "data/en-XE/*.parquet" - config_name: "en-XI" data_files: - split: train path: "data/en-XI/*.parquet" - config_name: "en-XK" data_files: - split: train path: "data/en-XK/*.parquet" - config_name: "en-XL" data_files: - split: train path: "data/en-XL/*.parquet" - config_name: "en-XM" data_files: - split: train path: "data/en-XM/*.parquet" - config_name: "en-XX" data_files: - split: train path: "data/en-XX/*.parquet" - config_name: "en-YA" data_files: - split: train path: "data/en-YA/*.parquet" - config_name: "en-YE" data_files: - split: train path: "data/en-YE/*.parquet" - config_name: "en-YT" data_files: - split: train path: "data/en-YT/*.parquet" - config_name: "en-YU" data_files: - split: train path: "data/en-YU/*.parquet" - config_name: "en-YY" data_files: - split: train path: "data/en-YY/*.parquet" - config_name: "en-ZA" data_files: - split: train path: "data/en-ZA/*.parquet" - config_name: "en-ZH" data_files: - split: train path: "data/en-ZH/*.parquet" - config_name: "en-ZM" data_files: - split: train path: "data/en-ZM/*.parquet" - config_name: "en-ZW" data_files: - split: train path: "data/en-ZW/*.parquet" - config_name: "en-ZZ" data_files: - split: train path: "data/en-ZZ/*.parquet" - config_name: "eo-AI" data_files: - split: train path: "data/eo-AI/*.parquet" - config_name: "eo-BE" data_files: - split: train path: "data/eo-BE/*.parquet" - config_name: "eo-BR" data_files: - split: train path: "data/eo-BR/*.parquet" - config_name: "eo-BZ" data_files: - split: train path: "data/eo-BZ/*.parquet" - config_name: "eo-CA" data_files: - split: train path: "data/eo-CA/*.parquet" - config_name: "eo-CH" data_files: - split: 
train path: "data/eo-CH/*.parquet" - config_name: "eo-CL" data_files: - split: train path: "data/eo-CL/*.parquet" - config_name: "eo-CN" data_files: - split: train path: "data/eo-CN/*.parquet" - config_name: "eo-CO" data_files: - split: train path: "data/eo-CO/*.parquet" - config_name: "eo-CZ" data_files: - split: train path: "data/eo-CZ/*.parquet" - config_name: "eo-DE" data_files: - split: train path: "data/eo-DE/*.parquet" - config_name: "eo-DK" data_files: - split: train path: "data/eo-DK/*.parquet" - config_name: "eo-EO" data_files: - split: train path: "data/eo-EO/*.parquet" - config_name: "eo-EU" data_files: - split: train path: "data/eo-EU/*.parquet" - config_name: "eo-FR" data_files: - split: train path: "data/eo-FR/*.parquet" - config_name: "eo-GB" data_files: - split: train path: "data/eo-GB/*.parquet" - config_name: "eo-HR" data_files: - split: train path: "data/eo-HR/*.parquet" - config_name: "eo-HU" data_files: - split: train path: "data/eo-HU/*.parquet" - config_name: "eo-ID" data_files: - split: train path: "data/eo-ID/*.parquet" - config_name: "eo-IT" data_files: - split: train path: "data/eo-IT/*.parquet" - config_name: "eo-JP" data_files: - split: train path: "data/eo-JP/*.parquet" - config_name: "eo-LA" data_files: - split: train path: "data/eo-LA/*.parquet" - config_name: "eo-PL" data_files: - split: train path: "data/eo-PL/*.parquet" - config_name: "eo-RO" data_files: - split: train path: "data/eo-RO/*.parquet" - config_name: "eo-RU" data_files: - split: train path: "data/eo-RU/*.parquet" - config_name: "eo-SE" data_files: - split: train path: "data/eo-SE/*.parquet" - config_name: "eo-TK" data_files: - split: train path: "data/eo-TK/*.parquet" - config_name: "eo-TR" data_files: - split: train path: "data/eo-TR/*.parquet" - config_name: "eo-TW" data_files: - split: train path: "data/eo-TW/*.parquet" - config_name: "eo-US" data_files: - split: train path: "data/eo-US/*.parquet" - config_name: "eo-UY" data_files: - split: train path: 
"data/eo-UY/*.parquet" - config_name: "eo-XX" data_files: - split: train path: "data/eo-XX/*.parquet" - config_name: "es-AC" data_files: - split: train path: "data/es-AC/*.parquet" - config_name: "es-AD" data_files: - split: train path: "data/es-AD/*.parquet" - config_name: "es-AE" data_files: - split: train path: "data/es-AE/*.parquet" - config_name: "es-AG" data_files: - split: train path: "data/es-AG/*.parquet" - config_name: "es-AI" data_files: - split: train path: "data/es-AI/*.parquet" - config_name: "es-AL" data_files: - split: train path: "data/es-AL/*.parquet" - config_name: "es-AM" data_files: - split: train path: "data/es-AM/*.parquet" - config_name: "es-AN" data_files: - split: train path: "data/es-AN/*.parquet" - config_name: "es-AO" data_files: - split: train path: "data/es-AO/*.parquet" - config_name: "es-AQ" data_files: - split: train path: "data/es-AQ/*.parquet" - config_name: "es-AR" data_files: - split: train path: "data/es-AR/*.parquet" - config_name: "es-AS" data_files: - split: train path: "data/es-AS/*.parquet" - config_name: "es-AT" data_files: - split: train path: "data/es-AT/*.parquet" - config_name: "es-AU" data_files: - split: train path: "data/es-AU/*.parquet" - config_name: "es-AX" data_files: - split: train path: "data/es-AX/*.parquet" - config_name: "es-AZ" data_files: - split: train path: "data/es-AZ/*.parquet" - config_name: "es-BA" data_files: - split: train path: "data/es-BA/*.parquet" - config_name: "es-BD" data_files: - split: train path: "data/es-BD/*.parquet" - config_name: "es-BE" data_files: - split: train path: "data/es-BE/*.parquet" - config_name: "es-BG" data_files: - split: train path: "data/es-BG/*.parquet" - config_name: "es-BL" data_files: - split: train path: "data/es-BL/*.parquet" - config_name: "es-BM" data_files: - split: train path: "data/es-BM/*.parquet" - config_name: "es-BO" data_files: - split: train path: "data/es-BO/*.parquet" - config_name: "es-BQ" data_files: - split: train path: "data/es-BQ/*.parquet" - 
config_name: "es-BR" data_files: - split: train path: "data/es-BR/*.parquet" - config_name: "es-BS" data_files: - split: train path: "data/es-BS/*.parquet" - config_name: "es-BY" data_files: - split: train path: "data/es-BY/*.parquet" - config_name: "es-BZ" data_files: - split: train path: "data/es-BZ/*.parquet" - config_name: "es-CA" data_files: - split: train path: "data/es-CA/*.parquet" - config_name: "es-CC" data_files: - split: train path: "data/es-CC/*.parquet" - config_name: "es-CD" data_files: - split: train path: "data/es-CD/*.parquet" - config_name: "es-CF" data_files: - split: train path: "data/es-CF/*.parquet" - config_name: "es-CH" data_files: - split: train path: "data/es-CH/*.parquet" - config_name: "es-CL" data_files: - split: train path: "data/es-CL/*.parquet" - config_name: "es-CN" data_files: - split: train path: "data/es-CN/*.parquet" - config_name: "es-CO" data_files: - split: train path: "data/es-CO/*.parquet" - config_name: "es-CR" data_files: - split: train path: "data/es-CR/*.parquet" - config_name: "es-CU" data_files: - split: train path: "data/es-CU/*.parquet" - config_name: "es-CX" data_files: - split: train path: "data/es-CX/*.parquet" - config_name: "es-CY" data_files: - split: train path: "data/es-CY/*.parquet" - config_name: "es-CZ" data_files: - split: train path: "data/es-CZ/*.parquet" - config_name: "es-DE" data_files: - split: train path: "data/es-DE/*.parquet" - config_name: "es-DJ" data_files: - split: train path: "data/es-DJ/*.parquet" - config_name: "es-DK" data_files: - split: train path: "data/es-DK/*.parquet" - config_name: "es-DO" data_files: - split: train path: "data/es-DO/*.parquet" - config_name: "es-DZ" data_files: - split: train path: "data/es-DZ/*.parquet" - config_name: "es-EC" data_files: - split: train path: "data/es-EC/*.parquet" - config_name: "es-EE" data_files: - split: train path: "data/es-EE/*.parquet" - config_name: "es-EG" data_files: - split: train path: "data/es-EG/*.parquet" - config_name: "es-EN" 
data_files: - split: train path: "data/es-EN/*.parquet" - config_name: "es-ES" data_files: - split: train path: "data/es-ES/*.parquet" - config_name: "es-ET" data_files: - split: train path: "data/es-ET/*.parquet" - config_name: "es-EU" data_files: - split: train path: "data/es-EU/*.parquet" - config_name: "es-FI" data_files: - split: train path: "data/es-FI/*.parquet" - config_name: "es-FM" data_files: - split: train path: "data/es-FM/*.parquet" - config_name: "es-FO" data_files: - split: train path: "data/es-FO/*.parquet" - config_name: "es-FR" data_files: - split: train path: "data/es-FR/*.parquet" - config_name: "es-GA" data_files: - split: train path: "data/es-GA/*.parquet" - config_name: "es-GB" data_files: - split: train path: "data/es-GB/*.parquet" - config_name: "es-GD" data_files: - split: train path: "data/es-GD/*.parquet" - config_name: "es-GG" data_files: - split: train path: "data/es-GG/*.parquet" - config_name: "es-GI" data_files: - split: train path: "data/es-GI/*.parquet" - config_name: "es-GL" data_files: - split: train path: "data/es-GL/*.parquet" - config_name: "es-GP" data_files: - split: train path: "data/es-GP/*.parquet" - config_name: "es-GQ" data_files: - split: train path: "data/es-GQ/*.parquet" - config_name: "es-GR" data_files: - split: train path: "data/es-GR/*.parquet" - config_name: "es-GT" data_files: - split: train path: "data/es-GT/*.parquet" - config_name: "es-HK" data_files: - split: train path: "data/es-HK/*.parquet" - config_name: "es-HN" data_files: - split: train path: "data/es-HN/*.parquet" - config_name: "es-HR" data_files: - split: train path: "data/es-HR/*.parquet" - config_name: "es-HT" data_files: - split: train path: "data/es-HT/*.parquet" - config_name: "es-HU" data_files: - split: train path: "data/es-HU/*.parquet" - config_name: "es-IA" data_files: - split: train path: "data/es-IA/*.parquet" - config_name: "es-ID" data_files: - split: train path: "data/es-ID/*.parquet" - config_name: "es-IE" data_files: - split: 
train path: "data/es-IE/*.parquet" - config_name: "es-IL" data_files: - split: train path: "data/es-IL/*.parquet" - config_name: "es-IM" data_files: - split: train path: "data/es-IM/*.parquet" - config_name: "es-IN" data_files: - split: train path: "data/es-IN/*.parquet" - config_name: "es-IO" data_files: - split: train path: "data/es-IO/*.parquet" - config_name: "es-IR" data_files: - split: train path: "data/es-IR/*.parquet" - config_name: "es-IS" data_files: - split: train path: "data/es-IS/*.parquet" - config_name: "es-IT" data_files: - split: train path: "data/es-IT/*.parquet" - config_name: "es-JP" data_files: - split: train path: "data/es-JP/*.parquet" - config_name: "es-KE" data_files: - split: train path: "data/es-KE/*.parquet" - config_name: "es-KG" data_files: - split: train path: "data/es-KG/*.parquet" - config_name: "es-KR" data_files: - split: train path: "data/es-KR/*.parquet" - config_name: "es-KZ" data_files: - split: train path: "data/es-KZ/*.parquet" - config_name: "es-LA" data_files: - split: train path: "data/es-LA/*.parquet" - config_name: "es-LB" data_files: - split: train path: "data/es-LB/*.parquet" - config_name: "es-LC" data_files: - split: train path: "data/es-LC/*.parquet" - config_name: "es-LI" data_files: - split: train path: "data/es-LI/*.parquet" - config_name: "es-LK" data_files: - split: train path: "data/es-LK/*.parquet" - config_name: "es-LT" data_files: - split: train path: "data/es-LT/*.parquet" - config_name: "es-LU" data_files: - split: train path: "data/es-LU/*.parquet" - config_name: "es-LV" data_files: - split: train path: "data/es-LV/*.parquet" - config_name: "es-LX" data_files: - split: train path: "data/es-LX/*.parquet" - config_name: "es-LY" data_files: - split: train path: "data/es-LY/*.parquet" - config_name: "es-MA" data_files: - split: train path: "data/es-MA/*.parquet" - config_name: "es-MC" data_files: - split: train path: "data/es-MC/*.parquet" - config_name: "es-MD" data_files: - split: train path: 
"data/es-MD/*.parquet" - config_name: "es-ME" data_files: - split: train path: "data/es-ME/*.parquet" - config_name: "es-MF" data_files: - split: train path: "data/es-MF/*.parquet" - config_name: "es-MK" data_files: - split: train path: "data/es-MK/*.parquet" - config_name: "es-ML" data_files: - split: train path: "data/es-ML/*.parquet" - config_name: "es-MQ" data_files: - split: train path: "data/es-MQ/*.parquet" - config_name: "es-MS" data_files: - split: train path: "data/es-MS/*.parquet" - config_name: "es-MX" data_files: - split: train path: "data/es-MX/*.parquet" - config_name: "es-MY" data_files: - split: train path: "data/es-MY/*.parquet" - config_name: "es-NA" data_files: - split: train path: "data/es-NA/*.parquet" - config_name: "es-NG" data_files: - split: train path: "data/es-NG/*.parquet" - config_name: "es-NI" data_files: - split: train path: "data/es-NI/*.parquet" - config_name: "es-NL" data_files: - split: train path: "data/es-NL/*.parquet" - config_name: "es-NO" data_files: - split: train path: "data/es-NO/*.parquet" - config_name: "es-NU" data_files: - split: train path: "data/es-NU/*.parquet" - config_name: "es-NZ" data_files: - split: train path: "data/es-NZ/*.parquet" - config_name: "es-PA" data_files: - split: train path: "data/es-PA/*.parquet" - config_name: "es-PE" data_files: - split: train path: "data/es-PE/*.parquet" - config_name: "es-PH" data_files: - split: train path: "data/es-PH/*.parquet" - config_name: "es-PK" data_files: - split: train path: "data/es-PK/*.parquet" - config_name: "es-PL" data_files: - split: train path: "data/es-PL/*.parquet" - config_name: "es-PM" data_files: - split: train path: "data/es-PM/*.parquet" - config_name: "es-PR" data_files: - split: train path: "data/es-PR/*.parquet" - config_name: "es-PT" data_files: - split: train path: "data/es-PT/*.parquet" - config_name: "es-PW" data_files: - split: train path: "data/es-PW/*.parquet" - config_name: "es-PY" data_files: - split: train path: "data/es-PY/*.parquet" - 
config_name: "es-RK" data_files: - split: train path: "data/es-RK/*.parquet" - config_name: "es-RO" data_files: - split: train path: "data/es-RO/*.parquet" - config_name: "es-RS" data_files: - split: train path: "data/es-RS/*.parquet" - config_name: "es-RU" data_files: - split: train path: "data/es-RU/*.parquet" - config_name: "es-SA" data_files: - split: train path: "data/es-SA/*.parquet" - config_name: "es-SC" data_files: - split: train path: "data/es-SC/*.parquet" - config_name: "es-SE" data_files: - split: train path: "data/es-SE/*.parquet" - config_name: "es-SG" data_files: - split: train path: "data/es-SG/*.parquet" - config_name: "es-SH" data_files: - split: train path: "data/es-SH/*.parquet" - config_name: "es-SI" data_files: - split: train path: "data/es-SI/*.parquet" - config_name: "es-SK" data_files: - split: train path: "data/es-SK/*.parquet" - config_name: "es-SN" data_files: - split: train path: "data/es-SN/*.parquet" - config_name: "es-SO" data_files: - split: train path: "data/es-SO/*.parquet" - config_name: "es-SP" data_files: - split: train path: "data/es-SP/*.parquet" - config_name: "es-ST" data_files: - split: train path: "data/es-ST/*.parquet" - config_name: "es-SV" data_files: - split: train path: "data/es-SV/*.parquet" - config_name: "es-TH" data_files: - split: train path: "data/es-TH/*.parquet" - config_name: "es-TK" data_files: - split: train path: "data/es-TK/*.parquet" - config_name: "es-TL" data_files: - split: train path: "data/es-TL/*.parquet" - config_name: "es-TN" data_files: - split: train path: "data/es-TN/*.parquet" - config_name: "es-TO" data_files: - split: train path: "data/es-TO/*.parquet" - config_name: "es-TR" data_files: - split: train path: "data/es-TR/*.parquet" - config_name: "es-TV" data_files: - split: train path: "data/es-TV/*.parquet" - config_name: "es-TW" data_files: - split: train path: "data/es-TW/*.parquet" - config_name: "es-TZ" data_files: - split: train path: "data/es-TZ/*.parquet" - config_name: "es-UA" 
data_files: - split: train path: "data/es-UA/*.parquet" - config_name: "es-UK" data_files: - split: train path: "data/es-UK/*.parquet" - config_name: "es-UM" data_files: - split: train path: "data/es-UM/*.parquet" - config_name: "es-UN" data_files: - split: train path: "data/es-UN/*.parquet" - config_name: "es-US" data_files: - split: train path: "data/es-US/*.parquet" - config_name: "es-UY" data_files: - split: train path: "data/es-UY/*.parquet" - config_name: "es-UZ" data_files: - split: train path: "data/es-UZ/*.parquet" - config_name: "es-VA" data_files: - split: train path: "data/es-VA/*.parquet" - config_name: "es-VC" data_files: - split: train path: "data/es-VC/*.parquet" - config_name: "es-VE" data_files: - split: train path: "data/es-VE/*.parquet" - config_name: "es-VG" data_files: - split: train path: "data/es-VG/*.parquet" - config_name: "es-VI" data_files: - split: train path: "data/es-VI/*.parquet" - config_name: "es-VN" data_files: - split: train path: "data/es-VN/*.parquet" - config_name: "es-VU" data_files: - split: train path: "data/es-VU/*.parquet" - config_name: "es-WF" data_files: - split: train path: "data/es-WF/*.parquet" - config_name: "es-WS" data_files: - split: train path: "data/es-WS/*.parquet" - config_name: "es-WW" data_files: - split: train path: "data/es-WW/*.parquet" - config_name: "es-XL" data_files: - split: train path: "data/es-XL/*.parquet" - config_name: "es-XM" data_files: - split: train path: "data/es-XM/*.parquet" - config_name: "es-XX" data_files: - split: train path: "data/es-XX/*.parquet" - config_name: "es-ZA" data_files: - split: train path: "data/es-ZA/*.parquet" - config_name: "es-ZB" data_files: - split: train path: "data/es-ZB/*.parquet" - config_name: "es-ZW" data_files: - split: train path: "data/es-ZW/*.parquet" - config_name: "es-ZZ" data_files: - split: train path: "data/es-ZZ/*.parquet" - config_name: "et-AL" data_files: - split: train path: "data/et-AL/*.parquet" - config_name: "et-AT" data_files: - split: 
train path: "data/et-AT/*.parquet" - config_name: "et-BR" data_files: - split: train path: "data/et-BR/*.parquet" - config_name: "et-CA" data_files: - split: train path: "data/et-CA/*.parquet" - config_name: "et-CC" data_files: - split: train path: "data/et-CC/*.parquet" - config_name: "et-CF" data_files: - split: train path: "data/et-CF/*.parquet" - config_name: "et-CH" data_files: - split: train path: "data/et-CH/*.parquet" - config_name: "et-CN" data_files: - split: train path: "data/et-CN/*.parquet" - config_name: "et-CO" data_files: - split: train path: "data/et-CO/*.parquet" - config_name: "et-DE" data_files: - split: train path: "data/et-DE/*.parquet" - config_name: "et-DO" data_files: - split: train path: "data/et-DO/*.parquet" - config_name: "et-EE" data_files: - split: train path: "data/et-EE/*.parquet" - config_name: "et-ES" data_files: - split: train path: "data/et-ES/*.parquet" - config_name: "et-ET" data_files: - split: train path: "data/et-ET/*.parquet" - config_name: "et-EU" data_files: - split: train path: "data/et-EU/*.parquet" - config_name: "et-FI" data_files: - split: train path: "data/et-FI/*.parquet" - config_name: "et-FM" data_files: - split: train path: "data/et-FM/*.parquet" - config_name: "et-FR" data_files: - split: train path: "data/et-FR/*.parquet" - config_name: "et-GB" data_files: - split: train path: "data/et-GB/*.parquet" - config_name: "et-HU" data_files: - split: train path: "data/et-HU/*.parquet" - config_name: "et-IN" data_files: - split: train path: "data/et-IN/*.parquet" - config_name: "et-IO" data_files: - split: train path: "data/et-IO/*.parquet" - config_name: "et-IT" data_files: - split: train path: "data/et-IT/*.parquet" - config_name: "et-LT" data_files: - split: train path: "data/et-LT/*.parquet" - config_name: "et-LV" data_files: - split: train path: "data/et-LV/*.parquet" - config_name: "et-ME" data_files: - split: train path: "data/et-ME/*.parquet" - config_name: "et-NL" data_files: - split: train path: 
"data/et-NL/*.parquet" - config_name: "et-NZ" data_files: - split: train path: "data/et-NZ/*.parquet" - config_name: "et-PL" data_files: - split: train path: "data/et-PL/*.parquet" - config_name: "et-RU" data_files: - split: train path: "data/et-RU/*.parquet" - config_name: "et-SE" data_files: - split: train path: "data/et-SE/*.parquet" - config_name: "et-SH" data_files: - split: train path: "data/et-SH/*.parquet" - config_name: "et-SI" data_files: - split: train path: "data/et-SI/*.parquet" - config_name: "et-ST" data_files: - split: train path: "data/et-ST/*.parquet" - config_name: "et-TV" data_files: - split: train path: "data/et-TV/*.parquet" - config_name: "et-UA" data_files: - split: train path: "data/et-UA/*.parquet" - config_name: "et-US" data_files: - split: train path: "data/et-US/*.parquet" - config_name: "et-WS" data_files: - split: train path: "data/et-WS/*.parquet" - config_name: "et-XX" data_files: - split: train path: "data/et-XX/*.parquet" - config_name: "et-ZA" data_files: - split: train path: "data/et-ZA/*.parquet" - config_name: "eu-AT" data_files: - split: train path: "data/eu-AT/*.parquet" - config_name: "eu-CN" data_files: - split: train path: "data/eu-CN/*.parquet" - config_name: "eu-CZ" data_files: - split: train path: "data/eu-CZ/*.parquet" - config_name: "eu-DE" data_files: - split: train path: "data/eu-DE/*.parquet" - config_name: "eu-EH" data_files: - split: train path: "data/eu-EH/*.parquet" - config_name: "eu-ES" data_files: - split: train path: "data/eu-ES/*.parquet" - config_name: "eu-EU" data_files: - split: train path: "data/eu-EU/*.parquet" - config_name: "eu-FR" data_files: - split: train path: "data/eu-FR/*.parquet" - config_name: "eu-GB" data_files: - split: train path: "data/eu-GB/*.parquet" - config_name: "eu-IE" data_files: - split: train path: "data/eu-IE/*.parquet" - config_name: "eu-IT" data_files: - split: train path: "data/eu-IT/*.parquet" - config_name: "eu-LA" data_files: - split: train path: "data/eu-LA/*.parquet" - 
config_name: "eu-PM" data_files: - split: train path: "data/eu-PM/*.parquet" - config_name: "eu-PT" data_files: - split: train path: "data/eu-PT/*.parquet" - config_name: "eu-TV" data_files: - split: train path: "data/eu-TV/*.parquet" - config_name: "eu-UA" data_files: - split: train path: "data/eu-UA/*.parquet" - config_name: "eu-US" data_files: - split: train path: "data/eu-US/*.parquet" - config_name: "eu-XX" data_files: - split: train path: "data/eu-XX/*.parquet" - config_name: "fa-AA" data_files: - split: train path: "data/fa-AA/*.parquet" - config_name: "fa-AF" data_files: - split: train path: "data/fa-AF/*.parquet" - config_name: "fa-AI" data_files: - split: train path: "data/fa-AI/*.parquet" - config_name: "fa-AM" data_files: - split: train path: "data/fa-AM/*.parquet" - config_name: "fa-AT" data_files: - split: train path: "data/fa-AT/*.parquet" - config_name: "fa-AU" data_files: - split: train path: "data/fa-AU/*.parquet" - config_name: "fa-AZ" data_files: - split: train path: "data/fa-AZ/*.parquet" - config_name: "fa-BE" data_files: - split: train path: "data/fa-BE/*.parquet" - config_name: "fa-BR" data_files: - split: train path: "data/fa-BR/*.parquet" - config_name: "fa-BZ" data_files: - split: train path: "data/fa-BZ/*.parquet" - config_name: "fa-CA" data_files: - split: train path: "data/fa-CA/*.parquet" - config_name: "fa-CC" data_files: - split: train path: "data/fa-CC/*.parquet" - config_name: "fa-CF" data_files: - split: train path: "data/fa-CF/*.parquet" - config_name: "fa-CH" data_files: - split: train path: "data/fa-CH/*.parquet" - config_name: "fa-CN" data_files: - split: train path: "data/fa-CN/*.parquet" - config_name: "fa-CO" data_files: - split: train path: "data/fa-CO/*.parquet" - config_name: "fa-CZ" data_files: - split: train path: "data/fa-CZ/*.parquet" - config_name: "fa-DE" data_files: - split: train path: "data/fa-DE/*.parquet" - config_name: "fa-DJ" data_files: - split: train path: "data/fa-DJ/*.parquet" - config_name: "fa-DK" 
data_files: - split: train path: "data/fa-DK/*.parquet" - config_name: "fa-EG" data_files: - split: train path: "data/fa-EG/*.parquet" - config_name: "fa-ES" data_files: - split: train path: "data/fa-ES/*.parquet" - config_name: "fa-ET" data_files: - split: train path: "data/fa-ET/*.parquet" - config_name: "fa-EU" data_files: - split: train path: "data/fa-EU/*.parquet" - config_name: "fa-FA" data_files: - split: train path: "data/fa-FA/*.parquet" - config_name: "fa-FI" data_files: - split: train path: "data/fa-FI/*.parquet" - config_name: "fa-FM" data_files: - split: train path: "data/fa-FM/*.parquet" - config_name: "fa-FR" data_files: - split: train path: "data/fa-FR/*.parquet" - config_name: "fa-GA" data_files: - split: train path: "data/fa-GA/*.parquet" - config_name: "fa-GB" data_files: - split: train path: "data/fa-GB/*.parquet" - config_name: "fa-GE" data_files: - split: train path: "data/fa-GE/*.parquet" - config_name: "fa-GQ" data_files: - split: train path: "data/fa-GQ/*.parquet" - config_name: "fa-HR" data_files: - split: train path: "data/fa-HR/*.parquet" - config_name: "fa-IM" data_files: - split: train path: "data/fa-IM/*.parquet" - config_name: "fa-IN" data_files: - split: train path: "data/fa-IN/*.parquet" - config_name: "fa-IO" data_files: - split: train path: "data/fa-IO/*.parquet" - config_name: "fa-IR" data_files: - split: train path: "data/fa-IR/*.parquet" - config_name: "fa-IT" data_files: - split: train path: "data/fa-IT/*.parquet" - config_name: "fa-JP" data_files: - split: train path: "data/fa-JP/*.parquet" - config_name: "fa-ME" data_files: - split: train path: "data/fa-ME/*.parquet" - config_name: "fa-ML" data_files: - split: train path: "data/fa-ML/*.parquet" - config_name: "fa-NL" data_files: - split: train path: "data/fa-NL/*.parquet" - config_name: "fa-NO" data_files: - split: train path: "data/fa-NO/*.parquet" - config_name: "fa-NU" data_files: - split: train path: "data/fa-NU/*.parquet" - config_name: "fa-PL" data_files: - split: 
train path: "data/fa-PL/*.parquet" - config_name: "fa-PW" data_files: - split: train path: "data/fa-PW/*.parquet" - config_name: "fa-RI" data_files: - split: train path: "data/fa-RI/*.parquet" - config_name: "fa-RU" data_files: - split: train path: "data/fa-RU/*.parquet" - config_name: "fa-SA" data_files: - split: train path: "data/fa-SA/*.parquet" - config_name: "fa-SC" data_files: - split: train path: "data/fa-SC/*.parquet" - config_name: "fa-SE" data_files: - split: train path: "data/fa-SE/*.parquet" - config_name: "fa-SH" data_files: - split: train path: "data/fa-SH/*.parquet" - config_name: "fa-ST" data_files: - split: train path: "data/fa-ST/*.parquet" - config_name: "fa-SY" data_files: - split: train path: "data/fa-SY/*.parquet" - config_name: "fa-TC" data_files: - split: train path: "data/fa-TC/*.parquet" - config_name: "fa-TK" data_files: - split: train path: "data/fa-TK/*.parquet" - config_name: "fa-TL" data_files: - split: train path: "data/fa-TL/*.parquet" - config_name: "fa-TO" data_files: - split: train path: "data/fa-TO/*.parquet" - config_name: "fa-TR" data_files: - split: train path: "data/fa-TR/*.parquet" - config_name: "fa-TV" data_files: - split: train path: "data/fa-TV/*.parquet" - config_name: "fa-US" data_files: - split: train path: "data/fa-US/*.parquet" - config_name: "fa-WS" data_files: - split: train path: "data/fa-WS/*.parquet" - config_name: "fa-XX" data_files: - split: train path: "data/fa-XX/*.parquet" - config_name: "fa-YE" data_files: - split: train path: "data/fa-YE/*.parquet" - config_name: "fa-ZA" data_files: - split: train path: "data/fa-ZA/*.parquet" - config_name: "fi-AI" data_files: - split: train path: "data/fi-AI/*.parquet" - config_name: "fi-AL" data_files: - split: train path: "data/fi-AL/*.parquet" - config_name: "fi-AT" data_files: - split: train path: "data/fi-AT/*.parquet" - config_name: "fi-AU" data_files: - split: train path: "data/fi-AU/*.parquet" - config_name: "fi-AX" data_files: - split: train path: 
"data/fi-AX/*.parquet" - config_name: "fi-BE" data_files: - split: train path: "data/fi-BE/*.parquet" - config_name: "fi-BR" data_files: - split: train path: "data/fi-BR/*.parquet" - config_name: "fi-CA" data_files: - split: train path: "data/fi-CA/*.parquet" - config_name: "fi-CC" data_files: - split: train path: "data/fi-CC/*.parquet" - config_name: "fi-CF" data_files: - split: train path: "data/fi-CF/*.parquet" - config_name: "fi-CH" data_files: - split: train path: "data/fi-CH/*.parquet" - config_name: "fi-CL" data_files: - split: train path: "data/fi-CL/*.parquet" - config_name: "fi-CN" data_files: - split: train path: "data/fi-CN/*.parquet" - config_name: "fi-CO" data_files: - split: train path: "data/fi-CO/*.parquet" - config_name: "fi-CZ" data_files: - split: train path: "data/fi-CZ/*.parquet" - config_name: "fi-DE" data_files: - split: train path: "data/fi-DE/*.parquet" - config_name: "fi-DK" data_files: - split: train path: "data/fi-DK/*.parquet" - config_name: "fi-EE" data_files: - split: train path: "data/fi-EE/*.parquet" - config_name: "fi-EN" data_files: - split: train path: "data/fi-EN/*.parquet" - config_name: "fi-ES" data_files: - split: train path: "data/fi-ES/*.parquet" - config_name: "fi-EU" data_files: - split: train path: "data/fi-EU/*.parquet" - config_name: "fi-FI" data_files: - split: train path: "data/fi-FI/*.parquet" - config_name: "fi-FL" data_files: - split: train path: "data/fi-FL/*.parquet" - config_name: "fi-FM" data_files: - split: train path: "data/fi-FM/*.parquet" - config_name: "fi-FR" data_files: - split: train path: "data/fi-FR/*.parquet" - config_name: "fi-GA" data_files: - split: train path: "data/fi-GA/*.parquet" - config_name: "fi-GB" data_files: - split: train path: "data/fi-GB/*.parquet" - config_name: "fi-GL" data_files: - split: train path: "data/fi-GL/*.parquet" - config_name: "fi-GR" data_files: - split: train path: "data/fi-GR/*.parquet" - config_name: "fi-HU" data_files: - split: train path: "data/fi-HU/*.parquet" - 
config_name: "fi-ID" data_files: - split: train path: "data/fi-ID/*.parquet" - config_name: "fi-IE" data_files: - split: train path: "data/fi-IE/*.parquet" - config_name: "fi-IL" data_files: - split: train path: "data/fi-IL/*.parquet" - config_name: "fi-IN" data_files: - split: train path: "data/fi-IN/*.parquet" - config_name: "fi-IO" data_files: - split: train path: "data/fi-IO/*.parquet" - config_name: "fi-IS" data_files: - split: train path: "data/fi-IS/*.parquet" - config_name: "fi-IT" data_files: - split: train path: "data/fi-IT/*.parquet" - config_name: "fi-JE" data_files: - split: train path: "data/fi-JE/*.parquet" - config_name: "fi-KZ" data_files: - split: train path: "data/fi-KZ/*.parquet" - config_name: "fi-LA" data_files: - split: train path: "data/fi-LA/*.parquet" - config_name: "fi-LI" data_files: - split: train path: "data/fi-LI/*.parquet" - config_name: "fi-LT" data_files: - split: train path: "data/fi-LT/*.parquet" - config_name: "fi-LV" data_files: - split: train path: "data/fi-LV/*.parquet" - config_name: "fi-ME" data_files: - split: train path: "data/fi-ME/*.parquet" - config_name: "fi-MU" data_files: - split: train path: "data/fi-MU/*.parquet" - config_name: "fi-MX" data_files: - split: train path: "data/fi-MX/*.parquet" - config_name: "fi-NL" data_files: - split: train path: "data/fi-NL/*.parquet" - config_name: "fi-NO" data_files: - split: train path: "data/fi-NO/*.parquet" - config_name: "fi-NU" data_files: - split: train path: "data/fi-NU/*.parquet" - config_name: "fi-NZ" data_files: - split: train path: "data/fi-NZ/*.parquet" - config_name: "fi-PL" data_files: - split: train path: "data/fi-PL/*.parquet" - config_name: "fi-PW" data_files: - split: train path: "data/fi-PW/*.parquet" - config_name: "fi-RO" data_files: - split: train path: "data/fi-RO/*.parquet" - config_name: "fi-RU" data_files: - split: train path: "data/fi-RU/*.parquet" - config_name: "fi-SE" data_files: - split: train path: "data/fi-SE/*.parquet" - config_name: "fi-SH" 
data_files: - split: train path: "data/fi-SH/*.parquet" - config_name: "fi-SI" data_files: - split: train path: "data/fi-SI/*.parquet" - config_name: "fi-SK" data_files: - split: train path: "data/fi-SK/*.parquet" - config_name: "fi-TK" data_files: - split: train path: "data/fi-TK/*.parquet" - config_name: "fi-TR" data_files: - split: train path: "data/fi-TR/*.parquet" - config_name: "fi-TV" data_files: - split: train path: "data/fi-TV/*.parquet" - config_name: "fi-UA" data_files: - split: train path: "data/fi-UA/*.parquet" - config_name: "fi-UK" data_files: - split: train path: "data/fi-UK/*.parquet" - config_name: "fi-US" data_files: - split: train path: "data/fi-US/*.parquet" - config_name: "fi-VN" data_files: - split: train path: "data/fi-VN/*.parquet" - config_name: "fi-WS" data_files: - split: train path: "data/fi-WS/*.parquet" - config_name: "fi-XX" data_files: - split: train path: "data/fi-XX/*.parquet" - config_name: "fi-ZA" data_files: - split: train path: "data/fi-ZA/*.parquet" - config_name: "fr-AA" data_files: - split: train path: "data/fr-AA/*.parquet" - config_name: "fr-AD" data_files: - split: train path: "data/fr-AD/*.parquet" - config_name: "fr-AE" data_files: - split: train path: "data/fr-AE/*.parquet" - config_name: "fr-AF" data_files: - split: train path: "data/fr-AF/*.parquet" - config_name: "fr-AG" data_files: - split: train path: "data/fr-AG/*.parquet" - config_name: "fr-AI" data_files: - split: train path: "data/fr-AI/*.parquet" - config_name: "fr-AL" data_files: - split: train path: "data/fr-AL/*.parquet" - config_name: "fr-AM" data_files: - split: train path: "data/fr-AM/*.parquet" - config_name: "fr-AO" data_files: - split: train path: "data/fr-AO/*.parquet" - config_name: "fr-AR" data_files: - split: train path: "data/fr-AR/*.parquet" - config_name: "fr-AS" data_files: - split: train path: "data/fr-AS/*.parquet" - config_name: "fr-AT" data_files: - split: train path: "data/fr-AT/*.parquet" - config_name: "fr-AU" data_files: - split: 
train path: "data/fr-AU/*.parquet" - config_name: "fr-AX" data_files: - split: train path: "data/fr-AX/*.parquet" - config_name: "fr-AZ" data_files: - split: train path: "data/fr-AZ/*.parquet" - config_name: "fr-BA" data_files: - split: train path: "data/fr-BA/*.parquet" - config_name: "fr-BD" data_files: - split: train path: "data/fr-BD/*.parquet" - config_name: "fr-BE" data_files: - split: train path: "data/fr-BE/*.parquet" - config_name: "fr-BF" data_files: - split: train path: "data/fr-BF/*.parquet" - config_name: "fr-BG" data_files: - split: train path: "data/fr-BG/*.parquet" - config_name: "fr-BI" data_files: - split: train path: "data/fr-BI/*.parquet" - config_name: "fr-BJ" data_files: - split: train path: "data/fr-BJ/*.parquet" - config_name: "fr-BM" data_files: - split: train path: "data/fr-BM/*.parquet" - config_name: "fr-BO" data_files: - split: train path: "data/fr-BO/*.parquet" - config_name: "fr-BR" data_files: - split: train path: "data/fr-BR/*.parquet" - config_name: "fr-BW" data_files: - split: train path: "data/fr-BW/*.parquet" - config_name: "fr-BY" data_files: - split: train path: "data/fr-BY/*.parquet" - config_name: "fr-BZ" data_files: - split: train path: "data/fr-BZ/*.parquet" - config_name: "fr-CA" data_files: - split: train path: "data/fr-CA/*.parquet" - config_name: "fr-CC" data_files: - split: train path: "data/fr-CC/*.parquet" - config_name: "fr-CD" data_files: - split: train path: "data/fr-CD/*.parquet" - config_name: "fr-CF" data_files: - split: train path: "data/fr-CF/*.parquet" - config_name: "fr-CG" data_files: - split: train path: "data/fr-CG/*.parquet" - config_name: "fr-CH" data_files: - split: train path: "data/fr-CH/*.parquet" - config_name: "fr-CI" data_files: - split: train path: "data/fr-CI/*.parquet" - config_name: "fr-CL" data_files: - split: train path: "data/fr-CL/*.parquet" - config_name: "fr-CM" data_files: - split: train path: "data/fr-CM/*.parquet" - config_name: "fr-CN" data_files: - split: train path: 
"data/fr-CN/*.parquet" - config_name: "fr-CO" data_files: - split: train path: "data/fr-CO/*.parquet" - config_name: "fr-CR" data_files: - split: train path: "data/fr-CR/*.parquet" - config_name: "fr-CS" data_files: - split: train path: "data/fr-CS/*.parquet" - config_name: "fr-CU" data_files: - split: train path: "data/fr-CU/*.parquet" - config_name: "fr-CX" data_files: - split: train path: "data/fr-CX/*.parquet" - config_name: "fr-CY" data_files: - split: train path: "data/fr-CY/*.parquet" - config_name: "fr-CZ" data_files: - split: train path: "data/fr-CZ/*.parquet" - config_name: "fr-DE" data_files: - split: train path: "data/fr-DE/*.parquet" - config_name: "fr-DJ" data_files: - split: train path: "data/fr-DJ/*.parquet" - config_name: "fr-DK" data_files: - split: train path: "data/fr-DK/*.parquet" - config_name: "fr-DO" data_files: - split: train path: "data/fr-DO/*.parquet" - config_name: "fr-DZ" data_files: - split: train path: "data/fr-DZ/*.parquet" - config_name: "fr-EC" data_files: - split: train path: "data/fr-EC/*.parquet" - config_name: "fr-EE" data_files: - split: train path: "data/fr-EE/*.parquet" - config_name: "fr-EG" data_files: - split: train path: "data/fr-EG/*.parquet" - config_name: "fr-EN" data_files: - split: train path: "data/fr-EN/*.parquet" - config_name: "fr-ES" data_files: - split: train path: "data/fr-ES/*.parquet" - config_name: "fr-EU" data_files: - split: train path: "data/fr-EU/*.parquet" - config_name: "fr-FI" data_files: - split: train path: "data/fr-FI/*.parquet" - config_name: "fr-FJ" data_files: - split: train path: "data/fr-FJ/*.parquet" - config_name: "fr-FM" data_files: - split: train path: "data/fr-FM/*.parquet" - config_name: "fr-FO" data_files: - split: train path: "data/fr-FO/*.parquet" - config_name: "fr-FR" data_files: - split: train path: "data/fr-FR/*.parquet" - config_name: "fr-GA" data_files: - split: train path: "data/fr-GA/*.parquet" - config_name: "fr-GB" data_files: - split: train path: "data/fr-GB/*.parquet" - 
config_name: "fr-GD" data_files: - split: train path: "data/fr-GD/*.parquet" - config_name: "fr-GE" data_files: - split: train path: "data/fr-GE/*.parquet" - config_name: "fr-GF" data_files: - split: train path: "data/fr-GF/*.parquet" - config_name: "fr-GG" data_files: - split: train path: "data/fr-GG/*.parquet" - config_name: "fr-GN" data_files: - split: train path: "data/fr-GN/*.parquet" - config_name: "fr-GP" data_files: - split: train path: "data/fr-GP/*.parquet" - config_name: "fr-GQ" data_files: - split: train path: "data/fr-GQ/*.parquet" - config_name: "fr-GR" data_files: - split: train path: "data/fr-GR/*.parquet" - config_name: "fr-GT" data_files: - split: train path: "data/fr-GT/*.parquet" - config_name: "fr-GX" data_files: - split: train path: "data/fr-GX/*.parquet" - config_name: "fr-GY" data_files: - split: train path: "data/fr-GY/*.parquet" - config_name: "fr-HK" data_files: - split: train path: "data/fr-HK/*.parquet" - config_name: "fr-HR" data_files: - split: train path: "data/fr-HR/*.parquet" - config_name: "fr-HT" data_files: - split: train path: "data/fr-HT/*.parquet" - config_name: "fr-HU" data_files: - split: train path: "data/fr-HU/*.parquet" - config_name: "fr-ID" data_files: - split: train path: "data/fr-ID/*.parquet" - config_name: "fr-IE" data_files: - split: train path: "data/fr-IE/*.parquet" - config_name: "fr-IL" data_files: - split: train path: "data/fr-IL/*.parquet" - config_name: "fr-IM" data_files: - split: train path: "data/fr-IM/*.parquet" - config_name: "fr-IN" data_files: - split: train path: "data/fr-IN/*.parquet" - config_name: "fr-IO" data_files: - split: train path: "data/fr-IO/*.parquet" - config_name: "fr-IR" data_files: - split: train path: "data/fr-IR/*.parquet" - config_name: "fr-IS" data_files: - split: train path: "data/fr-IS/*.parquet" - config_name: "fr-IT" data_files: - split: train path: "data/fr-IT/*.parquet" - config_name: "fr-JO" data_files: - split: train path: "data/fr-JO/*.parquet" - config_name: "fr-JP" 
data_files: - split: train path: "data/fr-JP/*.parquet" - config_name: "fr-KE" data_files: - split: train path: "data/fr-KE/*.parquet" - config_name: "fr-KG" data_files: - split: train path: "data/fr-KG/*.parquet" - config_name: "fr-KH" data_files: - split: train path: "data/fr-KH/*.parquet" - config_name: "fr-KM" data_files: - split: train path: "data/fr-KM/*.parquet" - config_name: "fr-KP" data_files: - split: train path: "data/fr-KP/*.parquet" - config_name: "fr-KR" data_files: - split: train path: "data/fr-KR/*.parquet" - config_name: "fr-KW" data_files: - split: train path: "data/fr-KW/*.parquet" - config_name: "fr-KZ" data_files: - split: train path: "data/fr-KZ/*.parquet" - config_name: "fr-LA" data_files: - split: train path: "data/fr-LA/*.parquet" - config_name: "fr-LB" data_files: - split: train path: "data/fr-LB/*.parquet" - config_name: "fr-LI" data_files: - split: train path: "data/fr-LI/*.parquet" - config_name: "fr-LT" data_files: - split: train path: "data/fr-LT/*.parquet" - config_name: "fr-LU" data_files: - split: train path: "data/fr-LU/*.parquet" - config_name: "fr-LV" data_files: - split: train path: "data/fr-LV/*.parquet" - config_name: "fr-LY" data_files: - split: train path: "data/fr-LY/*.parquet" - config_name: "fr-MA" data_files: - split: train path: "data/fr-MA/*.parquet" - config_name: "fr-MC" data_files: - split: train path: "data/fr-MC/*.parquet" - config_name: "fr-MD" data_files: - split: train path: "data/fr-MD/*.parquet" - config_name: "fr-ME" data_files: - split: train path: "data/fr-ME/*.parquet" - config_name: "fr-MG" data_files: - split: train path: "data/fr-MG/*.parquet" - config_name: "fr-MK" data_files: - split: train path: "data/fr-MK/*.parquet" - config_name: "fr-ML" data_files: - split: train path: "data/fr-ML/*.parquet" - config_name: "fr-MN" data_files: - split: train path: "data/fr-MN/*.parquet" - config_name: "fr-MQ" data_files: - split: train path: "data/fr-MQ/*.parquet" - config_name: "fr-MR" data_files: - split: 
train path: "data/fr-MR/*.parquet" - config_name: "fr-MS" data_files: - split: train path: "data/fr-MS/*.parquet" - config_name: "fr-MT" data_files: - split: train path: "data/fr-MT/*.parquet" - config_name: "fr-MU" data_files: - split: train path: "data/fr-MU/*.parquet" - config_name: "fr-MX" data_files: - split: train path: "data/fr-MX/*.parquet" - config_name: "fr-MY" data_files: - split: train path: "data/fr-MY/*.parquet" - config_name: "fr-MZ" data_files: - split: train path: "data/fr-MZ/*.parquet" - config_name: "fr-NC" data_files: - split: train path: "data/fr-NC/*.parquet" - config_name: "fr-NE" data_files: - split: train path: "data/fr-NE/*.parquet" - config_name: "fr-NF" data_files: - split: train path: "data/fr-NF/*.parquet" - config_name: "fr-NG" data_files: - split: train path: "data/fr-NG/*.parquet" - config_name: "fr-NL" data_files: - split: train path: "data/fr-NL/*.parquet" - config_name: "fr-NO" data_files: - split: train path: "data/fr-NO/*.parquet" - config_name: "fr-NU" data_files: - split: train path: "data/fr-NU/*.parquet" - config_name: "fr-NZ" data_files: - split: train path: "data/fr-NZ/*.parquet" - config_name: "fr-OC" data_files: - split: train path: "data/fr-OC/*.parquet" - config_name: "fr-OM" data_files: - split: train path: "data/fr-OM/*.parquet" - config_name: "fr-PE" data_files: - split: train path: "data/fr-PE/*.parquet" - config_name: "fr-PF" data_files: - split: train path: "data/fr-PF/*.parquet" - config_name: "fr-PH" data_files: - split: train path: "data/fr-PH/*.parquet" - config_name: "fr-PK" data_files: - split: train path: "data/fr-PK/*.parquet" - config_name: "fr-PL" data_files: - split: train path: "data/fr-PL/*.parquet" - config_name: "fr-PM" data_files: - split: train path: "data/fr-PM/*.parquet" - config_name: "fr-PS" data_files: - split: train path: "data/fr-PS/*.parquet" - config_name: "fr-PT" data_files: - split: train path: "data/fr-PT/*.parquet" - config_name: "fr-PW" data_files: - split: train path: 
"data/fr-PW/*.parquet" - config_name: "fr-QC" data_files: - split: train path: "data/fr-QC/*.parquet" - config_name: "fr-RE" data_files: - split: train path: "data/fr-RE/*.parquet" - config_name: "fr-RO" data_files: - split: train path: "data/fr-RO/*.parquet" - config_name: "fr-RS" data_files: - split: train path: "data/fr-RS/*.parquet" - config_name: "fr-RT" data_files: - split: train path: "data/fr-RT/*.parquet" - config_name: "fr-RU" data_files: - split: train path: "data/fr-RU/*.parquet" - config_name: "fr-RW" data_files: - split: train path: "data/fr-RW/*.parquet" - config_name: "fr-SA" data_files: - split: train path: "data/fr-SA/*.parquet" - config_name: "fr-SC" data_files: - split: train path: "data/fr-SC/*.parquet" - config_name: "fr-SE" data_files: - split: train path: "data/fr-SE/*.parquet" - config_name: "fr-SG" data_files: - split: train path: "data/fr-SG/*.parquet" - config_name: "fr-SH" data_files: - split: train path: "data/fr-SH/*.parquet" - config_name: "fr-SI" data_files: - split: train path: "data/fr-SI/*.parquet" - config_name: "fr-SK" data_files: - split: train path: "data/fr-SK/*.parquet" - config_name: "fr-SM" data_files: - split: train path: "data/fr-SM/*.parquet" - config_name: "fr-SN" data_files: - split: train path: "data/fr-SN/*.parquet" - config_name: "fr-SO" data_files: - split: train path: "data/fr-SO/*.parquet" - config_name: "fr-ST" data_files: - split: train path: "data/fr-ST/*.parquet" - config_name: "fr-SV" data_files: - split: train path: "data/fr-SV/*.parquet" - config_name: "fr-SX" data_files: - split: train path: "data/fr-SX/*.parquet" - config_name: "fr-SY" data_files: - split: train path: "data/fr-SY/*.parquet" - config_name: "fr-TG" data_files: - split: train path: "data/fr-TG/*.parquet" - config_name: "fr-TH" data_files: - split: train path: "data/fr-TH/*.parquet" - config_name: "fr-TK" data_files: - split: train path: "data/fr-TK/*.parquet" - config_name: "fr-TL" data_files: - split: train path: "data/fr-TL/*.parquet" - 
config_name: "fr-TN" data_files: - split: train path: "data/fr-TN/*.parquet" - config_name: "fr-TO" data_files: - split: train path: "data/fr-TO/*.parquet" - config_name: "fr-TR" data_files: - split: train path: "data/fr-TR/*.parquet" - config_name: "fr-TT" data_files: - split: train path: "data/fr-TT/*.parquet" - config_name: "fr-TU" data_files: - split: train path: "data/fr-TU/*.parquet" - config_name: "fr-TV" data_files: - split: train path: "data/fr-TV/*.parquet" - config_name: "fr-TW" data_files: - split: train path: "data/fr-TW/*.parquet" - config_name: "fr-TZ" data_files: - split: train path: "data/fr-TZ/*.parquet" - config_name: "fr-UA" data_files: - split: train path: "data/fr-UA/*.parquet" - config_name: "fr-UG" data_files: - split: train path: "data/fr-UG/*.parquet" - config_name: "fr-UK" data_files: - split: train path: "data/fr-UK/*.parquet" - config_name: "fr-US" data_files: - split: train path: "data/fr-US/*.parquet" - config_name: "fr-UY" data_files: - split: train path: "data/fr-UY/*.parquet" - config_name: "fr-UZ" data_files: - split: train path: "data/fr-UZ/*.parquet" - config_name: "fr-VA" data_files: - split: train path: "data/fr-VA/*.parquet" - config_name: "fr-VE" data_files: - split: train path: "data/fr-VE/*.parquet" - config_name: "fr-VL" data_files: - split: train path: "data/fr-VL/*.parquet" - config_name: "fr-VN" data_files: - split: train path: "data/fr-VN/*.parquet" - config_name: "fr-VU" data_files: - split: train path: "data/fr-VU/*.parquet" - config_name: "fr-WF" data_files: - split: train path: "data/fr-WF/*.parquet" - config_name: "fr-WS" data_files: - split: train path: "data/fr-WS/*.parquet" - config_name: "fr-XL" data_files: - split: train path: "data/fr-XL/*.parquet" - config_name: "fr-XX" data_files: - split: train path: "data/fr-XX/*.parquet" - config_name: "fr-YT" data_files: - split: train path: "data/fr-YT/*.parquet" - config_name: "fr-ZA" data_files: - split: train path: "data/fr-ZA/*.parquet" - config_name: "fr-ZW" 
data_files: - split: train path: "data/fr-ZW/*.parquet" - config_name: "fr-ZZ" data_files: - split: train path: "data/fr-ZZ/*.parquet" - config_name: "fy-BR" data_files: - split: train path: "data/fy-BR/*.parquet" - config_name: "fy-EU" data_files: - split: train path: "data/fy-EU/*.parquet" - config_name: "fy-FR" data_files: - split: train path: "data/fy-FR/*.parquet" - config_name: "fy-IO" data_files: - split: train path: "data/fy-IO/*.parquet" - config_name: "fy-NL" data_files: - split: train path: "data/fy-NL/*.parquet" - config_name: "fy-PL" data_files: - split: train path: "data/fy-PL/*.parquet" - config_name: "fy-US" data_files: - split: train path: "data/fy-US/*.parquet" - config_name: "fy-XX" data_files: - split: train path: "data/fy-XX/*.parquet" - config_name: "ga-DE" data_files: - split: train path: "data/ga-DE/*.parquet" - config_name: "ga-EU" data_files: - split: train path: "data/ga-EU/*.parquet" - config_name: "ga-GA" data_files: - split: train path: "data/ga-GA/*.parquet" - config_name: "ga-GB" data_files: - split: train path: "data/ga-GB/*.parquet" - config_name: "ga-IE" data_files: - split: train path: "data/ga-IE/*.parquet" - config_name: "ga-PW" data_files: - split: train path: "data/ga-PW/*.parquet" - config_name: "ga-US" data_files: - split: train path: "data/ga-US/*.parquet" - config_name: "ga-XX" data_files: - split: train path: "data/ga-XX/*.parquet" - config_name: "gd-GB" data_files: - split: train path: "data/gd-GB/*.parquet" - config_name: "gd-XX" data_files: - split: train path: "data/gd-XX/*.parquet" - config_name: "gl-CO" data_files: - split: train path: "data/gl-CO/*.parquet" - config_name: "gl-ES" data_files: - split: train path: "data/gl-ES/*.parquet" - config_name: "gl-EU" data_files: - split: train path: "data/gl-EU/*.parquet" - config_name: "gl-FR" data_files: - split: train path: "data/gl-FR/*.parquet" - config_name: "gl-GB" data_files: - split: train path: "data/gl-GB/*.parquet" - config_name: "gl-GL" data_files: - split: 
train path: "data/gl-GL/*.parquet" - config_name: "gl-PT" data_files: - split: train path: "data/gl-PT/*.parquet" - config_name: "gl-RU" data_files: - split: train path: "data/gl-RU/*.parquet" - config_name: "gl-TV" data_files: - split: train path: "data/gl-TV/*.parquet" - config_name: "gl-US" data_files: - split: train path: "data/gl-US/*.parquet" - config_name: "gl-XX" data_files: - split: train path: "data/gl-XX/*.parquet" - config_name: "gn-XX" data_files: - split: train path: "data/gn-XX/*.parquet" - config_name: "gu-GB" data_files: - split: train path: "data/gu-GB/*.parquet" - config_name: "gu-IN" data_files: - split: train path: "data/gu-IN/*.parquet" - config_name: "gu-US" data_files: - split: train path: "data/gu-US/*.parquet" - config_name: "gu-XX" data_files: - split: train path: "data/gu-XX/*.parquet" - config_name: "he-AE" data_files: - split: train path: "data/he-AE/*.parquet" - config_name: "he-AL" data_files: - split: train path: "data/he-AL/*.parquet" - config_name: "he-AR" data_files: - split: train path: "data/he-AR/*.parquet" - config_name: "he-AU" data_files: - split: train path: "data/he-AU/*.parquet" - config_name: "he-BE" data_files: - split: train path: "data/he-BE/*.parquet" - config_name: "he-BR" data_files: - split: train path: "data/he-BR/*.parquet" - config_name: "he-BZ" data_files: - split: train path: "data/he-BZ/*.parquet" - config_name: "he-CA" data_files: - split: train path: "data/he-CA/*.parquet" - config_name: "he-CN" data_files: - split: train path: "data/he-CN/*.parquet" - config_name: "he-CO" data_files: - split: train path: "data/he-CO/*.parquet" - config_name: "he-CZ" data_files: - split: train path: "data/he-CZ/*.parquet" - config_name: "he-DE" data_files: - split: train path: "data/he-DE/*.parquet" - config_name: "he-DK" data_files: - split: train path: "data/he-DK/*.parquet" - config_name: "he-EE" data_files: - split: train path: "data/he-EE/*.parquet" - config_name: "he-EN" data_files: - split: train path: 
"data/he-EN/*.parquet" - config_name: "he-ES" data_files: - split: train path: "data/he-ES/*.parquet" - config_name: "he-EU" data_files: - split: train path: "data/he-EU/*.parquet" - config_name: "he-FM" data_files: - split: train path: "data/he-FM/*.parquet" - config_name: "he-FR" data_files: - split: train path: "data/he-FR/*.parquet" - config_name: "he-GB" data_files: - split: train path: "data/he-GB/*.parquet" - config_name: "he-GR" data_files: - split: train path: "data/he-GR/*.parquet" - config_name: "he-HE" data_files: - split: train path: "data/he-HE/*.parquet" - config_name: "he-HK" data_files: - split: train path: "data/he-HK/*.parquet" - config_name: "he-IL" data_files: - split: train path: "data/he-IL/*.parquet" - config_name: "he-IO" data_files: - split: train path: "data/he-IO/*.parquet" - config_name: "he-IS" data_files: - split: train path: "data/he-IS/*.parquet" - config_name: "he-IT" data_files: - split: train path: "data/he-IT/*.parquet" - config_name: "he-IW" data_files: - split: train path: "data/he-IW/*.parquet" - config_name: "he-JP" data_files: - split: train path: "data/he-JP/*.parquet" - config_name: "he-LY" data_files: - split: train path: "data/he-LY/*.parquet" - config_name: "he-ME" data_files: - split: train path: "data/he-ME/*.parquet" - config_name: "he-MY" data_files: - split: train path: "data/he-MY/*.parquet" - config_name: "he-PL" data_files: - split: train path: "data/he-PL/*.parquet" - config_name: "he-PW" data_files: - split: train path: "data/he-PW/*.parquet" - config_name: "he-RO" data_files: - split: train path: "data/he-RO/*.parquet" - config_name: "he-RU" data_files: - split: train path: "data/he-RU/*.parquet" - config_name: "he-SE" data_files: - split: train path: "data/he-SE/*.parquet" - config_name: "he-SY" data_files: - split: train path: "data/he-SY/*.parquet" - config_name: "he-TK" data_files: - split: train path: "data/he-TK/*.parquet" - config_name: "he-TV" data_files: - split: train path: "data/he-TV/*.parquet" - 
config_name: "he-UA" data_files: - split: train path: "data/he-UA/*.parquet" - config_name: "he-US" data_files: - split: train path: "data/he-US/*.parquet" - config_name: "he-UZ" data_files: - split: train path: "data/he-UZ/*.parquet" - config_name: "he-WS" data_files: - split: train path: "data/he-WS/*.parquet" - config_name: "he-XX" data_files: - split: train path: "data/he-XX/*.parquet" - config_name: "hi-AI" data_files: - split: train path: "data/hi-AI/*.parquet" - config_name: "hi-CA" data_files: - split: train path: "data/hi-CA/*.parquet" - config_name: "hi-CN" data_files: - split: train path: "data/hi-CN/*.parquet" - config_name: "hi-EU" data_files: - split: train path: "data/hi-EU/*.parquet" - config_name: "hi-GB" data_files: - split: train path: "data/hi-GB/*.parquet" - config_name: "hi-HI" data_files: - split: train path: "data/hi-HI/*.parquet" - config_name: "hi-IN" data_files: - split: train path: "data/hi-IN/*.parquet" - config_name: "hi-IO" data_files: - split: train path: "data/hi-IO/*.parquet" - config_name: "hi-JA" data_files: - split: train path: "data/hi-JA/*.parquet" - config_name: "hi-ME" data_files: - split: train path: "data/hi-ME/*.parquet" - config_name: "hi-ML" data_files: - split: train path: "data/hi-ML/*.parquet" - config_name: "hi-TV" data_files: - split: train path: "data/hi-TV/*.parquet" - config_name: "hi-US" data_files: - split: train path: "data/hi-US/*.parquet" - config_name: "hi-XX" data_files: - split: train path: "data/hi-XX/*.parquet" - config_name: "hr-BA" data_files: - split: train path: "data/hr-BA/*.parquet" - config_name: "hr-CA" data_files: - split: train path: "data/hr-CA/*.parquet" - config_name: "hr-CC" data_files: - split: train path: "data/hr-CC/*.parquet" - config_name: "hr-CN" data_files: - split: train path: "data/hr-CN/*.parquet" - config_name: "hr-CZ" data_files: - split: train path: "data/hr-CZ/*.parquet" - config_name: "hr-DE" data_files: - split: train path: "data/hr-DE/*.parquet" - config_name: "hr-EU" 
data_files: - split: train path: "data/hr-EU/*.parquet" - config_name: "hr-FR" data_files: - split: train path: "data/hr-FR/*.parquet" - config_name: "hr-GB" data_files: - split: train path: "data/hr-GB/*.parquet" - config_name: "hr-HR" data_files: - split: train path: "data/hr-HR/*.parquet" - config_name: "hr-HU" data_files: - split: train path: "data/hr-HU/*.parquet" - config_name: "hr-IE" data_files: - split: train path: "data/hr-IE/*.parquet" - config_name: "hr-IN" data_files: - split: train path: "data/hr-IN/*.parquet" - config_name: "hr-IO" data_files: - split: train path: "data/hr-IO/*.parquet" - config_name: "hr-IT" data_files: - split: train path: "data/hr-IT/*.parquet" - config_name: "hr-ME" data_files: - split: train path: "data/hr-ME/*.parquet" - config_name: "hr-PL" data_files: - split: train path: "data/hr-PL/*.parquet" - config_name: "hr-PT" data_files: - split: train path: "data/hr-PT/*.parquet" - config_name: "hr-RS" data_files: - split: train path: "data/hr-RS/*.parquet" - config_name: "hr-RU" data_files: - split: train path: "data/hr-RU/*.parquet" - config_name: "hr-SI" data_files: - split: train path: "data/hr-SI/*.parquet" - config_name: "hr-TV" data_files: - split: train path: "data/hr-TV/*.parquet" - config_name: "hr-US" data_files: - split: train path: "data/hr-US/*.parquet" - config_name: "hr-VA" data_files: - split: train path: "data/hr-VA/*.parquet" - config_name: "hr-WS" data_files: - split: train path: "data/hr-WS/*.parquet" - config_name: "hr-XX" data_files: - split: train path: "data/hr-XX/*.parquet" - config_name: "hsb-DE" data_files: - split: train path: "data/hsb-DE/*.parquet" - config_name: "hsb-HS" data_files: - split: train path: "data/hsb-HS/*.parquet" - config_name: "hsb-SB" data_files: - split: train path: "data/hsb-SB/*.parquet" - config_name: "hsb-US" data_files: - split: train path: "data/hsb-US/*.parquet" - config_name: "hsb-XX" data_files: - split: train path: "data/hsb-XX/*.parquet" - config_name: "hu-AI" data_files: - 
split: train path: "data/hu-AI/*.parquet" - config_name: "hu-AM" data_files: - split: train path: "data/hu-AM/*.parquet" - config_name: "hu-AR" data_files: - split: train path: "data/hu-AR/*.parquet" - config_name: "hu-AT" data_files: - split: train path: "data/hu-AT/*.parquet" - config_name: "hu-AU" data_files: - split: train path: "data/hu-AU/*.parquet" - config_name: "hu-BE" data_files: - split: train path: "data/hu-BE/*.parquet" - config_name: "hu-BG" data_files: - split: train path: "data/hu-BG/*.parquet" - config_name: "hu-BM" data_files: - split: train path: "data/hu-BM/*.parquet" - config_name: "hu-BR" data_files: - split: train path: "data/hu-BR/*.parquet" - config_name: "hu-CA" data_files: - split: train path: "data/hu-CA/*.parquet" - config_name: "hu-CC" data_files: - split: train path: "data/hu-CC/*.parquet" - config_name: "hu-CF" data_files: - split: train path: "data/hu-CF/*.parquet" - config_name: "hu-CH" data_files: - split: train path: "data/hu-CH/*.parquet" - config_name: "hu-CN" data_files: - split: train path: "data/hu-CN/*.parquet" - config_name: "hu-CO" data_files: - split: train path: "data/hu-CO/*.parquet" - config_name: "hu-CZ" data_files: - split: train path: "data/hu-CZ/*.parquet" - config_name: "hu-DE" data_files: - split: train path: "data/hu-DE/*.parquet" - config_name: "hu-DK" data_files: - split: train path: "data/hu-DK/*.parquet" - config_name: "hu-EE" data_files: - split: train path: "data/hu-EE/*.parquet" - config_name: "hu-EN" data_files: - split: train path: "data/hu-EN/*.parquet" - config_name: "hu-ES" data_files: - split: train path: "data/hu-ES/*.parquet" - config_name: "hu-EU" data_files: - split: train path: "data/hu-EU/*.parquet" - config_name: "hu-FI" data_files: - split: train path: "data/hu-FI/*.parquet" - config_name: "hu-FM" data_files: - split: train path: "data/hu-FM/*.parquet" - config_name: "hu-FR" data_files: - split: train path: "data/hu-FR/*.parquet" - config_name: "hu-GA" data_files: - split: train path: 
"data/hu-GA/*.parquet" - config_name: "hu-GB" data_files: - split: train path: "data/hu-GB/*.parquet" - config_name: "hu-GL" data_files: - split: train path: "data/hu-GL/*.parquet" - config_name: "hu-GQ" data_files: - split: train path: "data/hu-GQ/*.parquet" - config_name: "hu-GR" data_files: - split: train path: "data/hu-GR/*.parquet" - config_name: "hu-HR" data_files: - split: train path: "data/hu-HR/*.parquet" - config_name: "hu-HU" data_files: - split: train path: "data/hu-HU/*.parquet" - config_name: "hu-ID" data_files: - split: train path: "data/hu-ID/*.parquet" - config_name: "hu-IE" data_files: - split: train path: "data/hu-IE/*.parquet" - config_name: "hu-IN" data_files: - split: train path: "data/hu-IN/*.parquet" - config_name: "hu-IO" data_files: - split: train path: "data/hu-IO/*.parquet" - config_name: "hu-IT" data_files: - split: train path: "data/hu-IT/*.parquet" - config_name: "hu-JP" data_files: - split: train path: "data/hu-JP/*.parquet" - config_name: "hu-KR" data_files: - split: train path: "data/hu-KR/*.parquet" - config_name: "hu-LA" data_files: - split: train path: "data/hu-LA/*.parquet" - config_name: "hu-LT" data_files: - split: train path: "data/hu-LT/*.parquet" - config_name: "hu-LU" data_files: - split: train path: "data/hu-LU/*.parquet" - config_name: "hu-MA" data_files: - split: train path: "data/hu-MA/*.parquet" - config_name: "hu-ME" data_files: - split: train path: "data/hu-ME/*.parquet" - config_name: "hu-ML" data_files: - split: train path: "data/hu-ML/*.parquet" - config_name: "hu-NL" data_files: - split: train path: "data/hu-NL/*.parquet" - config_name: "hu-PL" data_files: - split: train path: "data/hu-PL/*.parquet" - config_name: "hu-PT" data_files: - split: train path: "data/hu-PT/*.parquet" - config_name: "hu-PW" data_files: - split: train path: "data/hu-PW/*.parquet" - config_name: "hu-QA" data_files: - split: train path: "data/hu-QA/*.parquet" - config_name: "hu-RO" data_files: - split: train path: "data/hu-RO/*.parquet" - 
config_name: "hu-RS" data_files: - split: train path: "data/hu-RS/*.parquet" - config_name: "hu-RU" data_files: - split: train path: "data/hu-RU/*.parquet" - config_name: "hu-SE" data_files: - split: train path: "data/hu-SE/*.parquet" - config_name: "hu-SI" data_files: - split: train path: "data/hu-SI/*.parquet" - config_name: "hu-SK" data_files: - split: train path: "data/hu-SK/*.parquet" - config_name: "hu-TK" data_files: - split: train path: "data/hu-TK/*.parquet" - config_name: "hu-TO" data_files: - split: train path: "data/hu-TO/*.parquet" - config_name: "hu-TR" data_files: - split: train path: "data/hu-TR/*.parquet" - config_name: "hu-TV" data_files: - split: train path: "data/hu-TV/*.parquet" - config_name: "hu-UA" data_files: - split: train path: "data/hu-UA/*.parquet" - config_name: "hu-US" data_files: - split: train path: "data/hu-US/*.parquet" - config_name: "hu-VA" data_files: - split: train path: "data/hu-VA/*.parquet" - config_name: "hu-VN" data_files: - split: train path: "data/hu-VN/*.parquet" - config_name: "hu-WS" data_files: - split: train path: "data/hu-WS/*.parquet" - config_name: "hu-XX" data_files: - split: train path: "data/hu-XX/*.parquet" - config_name: "hu-YU" data_files: - split: train path: "data/hu-YU/*.parquet" - config_name: "hu-ZA" data_files: - split: train path: "data/hu-ZA/*.parquet" - config_name: "hu-ZM" data_files: - split: train path: "data/hu-ZM/*.parquet" - config_name: "hy-AM" data_files: - split: train path: "data/hy-AM/*.parquet" - config_name: "hy-AR" data_files: - split: train path: "data/hy-AR/*.parquet" - config_name: "hy-AU" data_files: - split: train path: "data/hy-AU/*.parquet" - config_name: "hy-AZ" data_files: - split: train path: "data/hy-AZ/*.parquet" - config_name: "hy-CA" data_files: - split: train path: "data/hy-CA/*.parquet" - config_name: "hy-CH" data_files: - split: train path: "data/hy-CH/*.parquet" - config_name: "hy-CO" data_files: - split: train path: "data/hy-CO/*.parquet" - config_name: "hy-DE" 
data_files: - split: train path: "data/hy-DE/*.parquet" - config_name: "hy-EN" data_files: - split: train path: "data/hy-EN/*.parquet" - config_name: "hy-ET" data_files: - split: train path: "data/hy-ET/*.parquet" - config_name: "hy-EU" data_files: - split: train path: "data/hy-EU/*.parquet" - config_name: "hy-FM" data_files: - split: train path: "data/hy-FM/*.parquet" - config_name: "hy-FR" data_files: - split: train path: "data/hy-FR/*.parquet" - config_name: "hy-GB" data_files: - split: train path: "data/hy-GB/*.parquet" - config_name: "hy-GE" data_files: - split: train path: "data/hy-GE/*.parquet" - config_name: "hy-GR" data_files: - split: train path: "data/hy-GR/*.parquet" - config_name: "hy-HU" data_files: - split: train path: "data/hy-HU/*.parquet" - config_name: "hy-HY" data_files: - split: train path: "data/hy-HY/*.parquet" - config_name: "hy-IM" data_files: - split: train path: "data/hy-IM/*.parquet" - config_name: "hy-IR" data_files: - split: train path: "data/hy-IR/*.parquet" - config_name: "hy-IT" data_files: - split: train path: "data/hy-IT/*.parquet" - config_name: "hy-LB" data_files: - split: train path: "data/hy-LB/*.parquet" - config_name: "hy-ME" data_files: - split: train path: "data/hy-ME/*.parquet" - config_name: "hy-NL" data_files: - split: train path: "data/hy-NL/*.parquet" - config_name: "hy-PT" data_files: - split: train path: "data/hy-PT/*.parquet" - config_name: "hy-QA" data_files: - split: train path: "data/hy-QA/*.parquet" - config_name: "hy-RU" data_files: - split: train path: "data/hy-RU/*.parquet" - config_name: "hy-TR" data_files: - split: train path: "data/hy-TR/*.parquet" - config_name: "hy-TV" data_files: - split: train path: "data/hy-TV/*.parquet" - config_name: "hy-UA" data_files: - split: train path: "data/hy-UA/*.parquet" - config_name: "hy-US" data_files: - split: train path: "data/hy-US/*.parquet" - config_name: "hy-XX" data_files: - split: train path: "data/hy-XX/*.parquet" - config_name: "ia-IT" data_files: - split: 
train path: "data/ia-IT/*.parquet" - config_name: "ia-XX" data_files: - split: train path: "data/ia-XX/*.parquet" - config_name: "id-AA" data_files: - split: train path: "data/id-AA/*.parquet" - config_name: "id-AE" data_files: - split: train path: "data/id-AE/*.parquet" - config_name: "id-AI" data_files: - split: train path: "data/id-AI/*.parquet" - config_name: "id-AM" data_files: - split: train path: "data/id-AM/*.parquet" - config_name: "id-AR" data_files: - split: train path: "data/id-AR/*.parquet" - config_name: "id-AT" data_files: - split: train path: "data/id-AT/*.parquet" - config_name: "id-AU" data_files: - split: train path: "data/id-AU/*.parquet" - config_name: "id-BA" data_files: - split: train path: "data/id-BA/*.parquet" - config_name: "id-BE" data_files: - split: train path: "data/id-BE/*.parquet" - config_name: "id-BN" data_files: - split: train path: "data/id-BN/*.parquet" - config_name: "id-BO" data_files: - split: train path: "data/id-BO/*.parquet" - config_name: "id-BR" data_files: - split: train path: "data/id-BR/*.parquet" - config_name: "id-BZ" data_files: - split: train path: "data/id-BZ/*.parquet" - config_name: "id-CA" data_files: - split: train path: "data/id-CA/*.parquet" - config_name: "id-CC" data_files: - split: train path: "data/id-CC/*.parquet" - config_name: "id-CF" data_files: - split: train path: "data/id-CF/*.parquet" - config_name: "id-CH" data_files: - split: train path: "data/id-CH/*.parquet" - config_name: "id-CL" data_files: - split: train path: "data/id-CL/*.parquet" - config_name: "id-CM" data_files: - split: train path: "data/id-CM/*.parquet" - config_name: "id-CN" data_files: - split: train path: "data/id-CN/*.parquet" - config_name: "id-CO" data_files: - split: train path: "data/id-CO/*.parquet" - config_name: "id-CZ" data_files: - split: train path: "data/id-CZ/*.parquet" - config_name: "id-DE" data_files: - split: train path: "data/id-DE/*.parquet" - config_name: "id-DK" data_files: - split: train path: 
"data/id-DK/*.parquet" - config_name: "id-DO" data_files: - split: train path: "data/id-DO/*.parquet" - config_name: "id-EN" data_files: - split: train path: "data/id-EN/*.parquet" - config_name: "id-ES" data_files: - split: train path: "data/id-ES/*.parquet" - config_name: "id-EU" data_files: - split: train path: "data/id-EU/*.parquet" - config_name: "id-FI" data_files: - split: train path: "data/id-FI/*.parquet" - config_name: "id-FM" data_files: - split: train path: "data/id-FM/*.parquet" - config_name: "id-FR" data_files: - split: train path: "data/id-FR/*.parquet" - config_name: "id-GA" data_files: - split: train path: "data/id-GA/*.parquet" - config_name: "id-GB" data_files: - split: train path: "data/id-GB/*.parquet" - config_name: "id-GE" data_files: - split: train path: "data/id-GE/*.parquet" - config_name: "id-GG" data_files: - split: train path: "data/id-GG/*.parquet" - config_name: "id-GQ" data_files: - split: train path: "data/id-GQ/*.parquet" - config_name: "id-GT" data_files: - split: train path: "data/id-GT/*.parquet" - config_name: "id-HR" data_files: - split: train path: "data/id-HR/*.parquet" - config_name: "id-HU" data_files: - split: train path: "data/id-HU/*.parquet" - config_name: "id-ID" data_files: - split: train path: "data/id-ID/*.parquet" - config_name: "id-IE" data_files: - split: train path: "data/id-IE/*.parquet" - config_name: "id-IN" data_files: - split: train path: "data/id-IN/*.parquet" - config_name: "id-IO" data_files: - split: train path: "data/id-IO/*.parquet" - config_name: "id-IR" data_files: - split: train path: "data/id-IR/*.parquet" - config_name: "id-IS" data_files: - split: train path: "data/id-IS/*.parquet" - config_name: "id-IT" data_files: - split: train path: "data/id-IT/*.parquet" - config_name: "id-JP" data_files: - split: train path: "data/id-JP/*.parquet" - config_name: "id-KE" data_files: - split: train path: "data/id-KE/*.parquet" - config_name: "id-KR" data_files: - split: train path: "data/id-KR/*.parquet" - 
config_name: "id-LA" data_files: - split: train path: "data/id-LA/*.parquet" - config_name: "id-LC" data_files: - split: train path: "data/id-LC/*.parquet" - config_name: "id-LI" data_files: - split: train path: "data/id-LI/*.parquet" - config_name: "id-LK" data_files: - split: train path: "data/id-LK/*.parquet" - config_name: "id-LT" data_files: - split: train path: "data/id-LT/*.parquet" - config_name: "id-LV" data_files: - split: train path: "data/id-LV/*.parquet" - config_name: "id-LY" data_files: - split: train path: "data/id-LY/*.parquet" - config_name: "id-MA" data_files: - split: train path: "data/id-MA/*.parquet" - config_name: "id-ME" data_files: - split: train path: "data/id-ME/*.parquet" - config_name: "id-ML" data_files: - split: train path: "data/id-ML/*.parquet" - config_name: "id-MO" data_files: - split: train path: "data/id-MO/*.parquet" - config_name: "id-MS" data_files: - split: train path: "data/id-MS/*.parquet" - config_name: "id-MX" data_files: - split: train path: "data/id-MX/*.parquet" - config_name: "id-MY" data_files: - split: train path: "data/id-MY/*.parquet" - config_name: "id-NG" data_files: - split: train path: "data/id-NG/*.parquet" - config_name: "id-NL" data_files: - split: train path: "data/id-NL/*.parquet" - config_name: "id-NO" data_files: - split: train path: "data/id-NO/*.parquet" - config_name: "id-NU" data_files: - split: train path: "data/id-NU/*.parquet" - config_name: "id-NZ" data_files: - split: train path: "data/id-NZ/*.parquet" - config_name: "id-PE" data_files: - split: train path: "data/id-PE/*.parquet" - config_name: "id-PK" data_files: - split: train path: "data/id-PK/*.parquet" - config_name: "id-PL" data_files: - split: train path: "data/id-PL/*.parquet" - config_name: "id-PS" data_files: - split: train path: "data/id-PS/*.parquet" - config_name: "id-PT" data_files: - split: train path: "data/id-PT/*.parquet" - config_name: "id-PW" data_files: - split: train path: "data/id-PW/*.parquet" - config_name: "id-RO" 
data_files: - split: train path: "data/id-RO/*.parquet" - config_name: "id-RS" data_files: - split: train path: "data/id-RS/*.parquet" - config_name: "id-RU" data_files: - split: train path: "data/id-RU/*.parquet" - config_name: "id-SA" data_files: - split: train path: "data/id-SA/*.parquet" - config_name: "id-SB" data_files: - split: train path: "data/id-SB/*.parquet" - config_name: "id-SE" data_files: - split: train path: "data/id-SE/*.parquet" - config_name: "id-SG" data_files: - split: train path: "data/id-SG/*.parquet" - config_name: "id-SH" data_files: - split: train path: "data/id-SH/*.parquet" - config_name: "id-SI" data_files: - split: train path: "data/id-SI/*.parquet" - config_name: "id-SK" data_files: - split: train path: "data/id-SK/*.parquet" - config_name: "id-SO" data_files: - split: train path: "data/id-SO/*.parquet" - config_name: "id-TH" data_files: - split: train path: "data/id-TH/*.parquet" - config_name: "id-TK" data_files: - split: train path: "data/id-TK/*.parquet" - config_name: "id-TL" data_files: - split: train path: "data/id-TL/*.parquet" - config_name: "id-TO" data_files: - split: train path: "data/id-TO/*.parquet" - config_name: "id-TR" data_files: - split: train path: "data/id-TR/*.parquet" - config_name: "id-TV" data_files: - split: train path: "data/id-TV/*.parquet" - config_name: "id-TW" data_files: - split: train path: "data/id-TW/*.parquet" - config_name: "id-UA" data_files: - split: train path: "data/id-UA/*.parquet" - config_name: "id-UN" data_files: - split: train path: "data/id-UN/*.parquet" - config_name: "id-US" data_files: - split: train path: "data/id-US/*.parquet" - config_name: "id-VN" data_files: - split: train path: "data/id-VN/*.parquet" - config_name: "id-VU" data_files: - split: train path: "data/id-VU/*.parquet" - config_name: "id-WS" data_files: - split: train path: "data/id-WS/*.parquet" - config_name: "id-XX" data_files: - split: train path: "data/id-XX/*.parquet" - config_name: "id-ZA" data_files: - split: 
train path: "data/id-ZA/*.parquet" - config_name: "ilo-US" data_files: - split: train path: "data/ilo-US/*.parquet" - config_name: "ilo-XX" data_files: - split: train path: "data/ilo-XX/*.parquet" - config_name: "io-XX" data_files: - split: train path: "data/io-XX/*.parquet" - config_name: "is-AI" data_files: - split: train path: "data/is-AI/*.parquet" - config_name: "is-CC" data_files: - split: train path: "data/is-CC/*.parquet" - config_name: "is-CH" data_files: - split: train path: "data/is-CH/*.parquet" - config_name: "is-CO" data_files: - split: train path: "data/is-CO/*.parquet" - config_name: "is-DK" data_files: - split: train path: "data/is-DK/*.parquet" - config_name: "is-EN" data_files: - split: train path: "data/is-EN/*.parquet" - config_name: "is-ES" data_files: - split: train path: "data/is-ES/*.parquet" - config_name: "is-EU" data_files: - split: train path: "data/is-EU/*.parquet" - config_name: "is-FI" data_files: - split: train path: "data/is-FI/*.parquet" - config_name: "is-FO" data_files: - split: train path: "data/is-FO/*.parquet" - config_name: "is-GB" data_files: - split: train path: "data/is-GB/*.parquet" - config_name: "is-IO" data_files: - split: train path: "data/is-IO/*.parquet" - config_name: "is-IS" data_files: - split: train path: "data/is-IS/*.parquet" - config_name: "is-NO" data_files: - split: train path: "data/is-NO/*.parquet" - config_name: "is-RU" data_files: - split: train path: "data/is-RU/*.parquet" - config_name: "is-SE" data_files: - split: train path: "data/is-SE/*.parquet" - config_name: "is-TK" data_files: - split: train path: "data/is-TK/*.parquet" - config_name: "is-TW" data_files: - split: train path: "data/is-TW/*.parquet" - config_name: "is-UA" data_files: - split: train path: "data/is-UA/*.parquet" - config_name: "is-US" data_files: - split: train path: "data/is-US/*.parquet" - config_name: "is-XX" data_files: - split: train path: "data/is-XX/*.parquet" - config_name: "it-AA" data_files: - split: train path: 
"data/it-AA/*.parquet" - config_name: "it-AD" data_files: - split: train path: "data/it-AD/*.parquet" - config_name: "it-AE" data_files: - split: train path: "data/it-AE/*.parquet" - config_name: "it-AI" data_files: - split: train path: "data/it-AI/*.parquet" - config_name: "it-AL" data_files: - split: train path: "data/it-AL/*.parquet" - config_name: "it-AM" data_files: - split: train path: "data/it-AM/*.parquet" - config_name: "it-AR" data_files: - split: train path: "data/it-AR/*.parquet" - config_name: "it-AS" data_files: - split: train path: "data/it-AS/*.parquet" - config_name: "it-AT" data_files: - split: train path: "data/it-AT/*.parquet" - config_name: "it-AU" data_files: - split: train path: "data/it-AU/*.parquet" - config_name: "it-AX" data_files: - split: train path: "data/it-AX/*.parquet" - config_name: "it-AZ" data_files: - split: train path: "data/it-AZ/*.parquet" - config_name: "it-BD" data_files: - split: train path: "data/it-BD/*.parquet" - config_name: "it-BE" data_files: - split: train path: "data/it-BE/*.parquet" - config_name: "it-BG" data_files: - split: train path: "data/it-BG/*.parquet" - config_name: "it-BM" data_files: - split: train path: "data/it-BM/*.parquet" - config_name: "it-BQ" data_files: - split: train path: "data/it-BQ/*.parquet" - config_name: "it-BR" data_files: - split: train path: "data/it-BR/*.parquet" - config_name: "it-BY" data_files: - split: train path: "data/it-BY/*.parquet" - config_name: "it-BZ" data_files: - split: train path: "data/it-BZ/*.parquet" - config_name: "it-CA" data_files: - split: train path: "data/it-CA/*.parquet" - config_name: "it-CC" data_files: - split: train path: "data/it-CC/*.parquet" - config_name: "it-CD" data_files: - split: train path: "data/it-CD/*.parquet" - config_name: "it-CF" data_files: - split: train path: "data/it-CF/*.parquet" - config_name: "it-CH" data_files: - split: train path: "data/it-CH/*.parquet" - config_name: "it-CI" data_files: - split: train path: "data/it-CI/*.parquet" - 
config_name: "it-CL" data_files: - split: train path: "data/it-CL/*.parquet" - config_name: "it-CN" data_files: - split: train path: "data/it-CN/*.parquet" - config_name: "it-CO" data_files: - split: train path: "data/it-CO/*.parquet" - config_name: "it-CR" data_files: - split: train path: "data/it-CR/*.parquet" - config_name: "it-CU" data_files: - split: train path: "data/it-CU/*.parquet" - config_name: "it-CX" data_files: - split: train path: "data/it-CX/*.parquet" - config_name: "it-CY" data_files: - split: train path: "data/it-CY/*.parquet" - config_name: "it-CZ" data_files: - split: train path: "data/it-CZ/*.parquet" - config_name: "it-DE" data_files: - split: train path: "data/it-DE/*.parquet" - config_name: "it-DJ" data_files: - split: train path: "data/it-DJ/*.parquet" - config_name: "it-DK" data_files: - split: train path: "data/it-DK/*.parquet" - config_name: "it-DO" data_files: - split: train path: "data/it-DO/*.parquet" - config_name: "it-EC" data_files: - split: train path: "data/it-EC/*.parquet" - config_name: "it-EE" data_files: - split: train path: "data/it-EE/*.parquet" - config_name: "it-EN" data_files: - split: train path: "data/it-EN/*.parquet" - config_name: "it-ES" data_files: - split: train path: "data/it-ES/*.parquet" - config_name: "it-ET" data_files: - split: train path: "data/it-ET/*.parquet" - config_name: "it-EU" data_files: - split: train path: "data/it-EU/*.parquet" - config_name: "it-FI" data_files: - split: train path: "data/it-FI/*.parquet" - config_name: "it-FM" data_files: - split: train path: "data/it-FM/*.parquet" - config_name: "it-FR" data_files: - split: train path: "data/it-FR/*.parquet" - config_name: "it-GA" data_files: - split: train path: "data/it-GA/*.parquet" - config_name: "it-GB" data_files: - split: train path: "data/it-GB/*.parquet" - config_name: "it-GE" data_files: - split: train path: "data/it-GE/*.parquet" - config_name: "it-GG" data_files: - split: train path: "data/it-GG/*.parquet" - config_name: "it-GQ" 
data_files: - split: train path: "data/it-GQ/*.parquet" - config_name: "it-GR" data_files: - split: train path: "data/it-GR/*.parquet" - config_name: "it-GT" data_files: - split: train path: "data/it-GT/*.parquet" - config_name: "it-HR" data_files: - split: train path: "data/it-HR/*.parquet" - config_name: "it-HU" data_files: - split: train path: "data/it-HU/*.parquet" - config_name: "it-ID" data_files: - split: train path: "data/it-ID/*.parquet" - config_name: "it-IE" data_files: - split: train path: "data/it-IE/*.parquet" - config_name: "it-IL" data_files: - split: train path: "data/it-IL/*.parquet" - config_name: "it-IM" data_files: - split: train path: "data/it-IM/*.parquet" - config_name: "it-IN" data_files: - split: train path: "data/it-IN/*.parquet" - config_name: "it-IO" data_files: - split: train path: "data/it-IO/*.parquet" - config_name: "it-IR" data_files: - split: train path: "data/it-IR/*.parquet" - config_name: "it-IS" data_files: - split: train path: "data/it-IS/*.parquet" - config_name: "it-IT" data_files: - split: train path: "data/it-IT/*.parquet" - config_name: "it-JA" data_files: - split: train path: "data/it-JA/*.parquet" - config_name: "it-JP" data_files: - split: train path: "data/it-JP/*.parquet" - config_name: "it-KE" data_files: - split: train path: "data/it-KE/*.parquet" - config_name: "it-KR" data_files: - split: train path: "data/it-KR/*.parquet" - config_name: "it-KZ" data_files: - split: train path: "data/it-KZ/*.parquet" - config_name: "it-LA" data_files: - split: train path: "data/it-LA/*.parquet" - config_name: "it-LI" data_files: - split: train path: "data/it-LI/*.parquet" - config_name: "it-LK" data_files: - split: train path: "data/it-LK/*.parquet" - config_name: "it-LT" data_files: - split: train path: "data/it-LT/*.parquet" - config_name: "it-LU" data_files: - split: train path: "data/it-LU/*.parquet" - config_name: "it-LV" data_files: - split: train path: "data/it-LV/*.parquet" - config_name: "it-LY" data_files: - split: 
train path: "data/it-LY/*.parquet" - config_name: "it-MA" data_files: - split: train path: "data/it-MA/*.parquet" - config_name: "it-MC" data_files: - split: train path: "data/it-MC/*.parquet" - config_name: "it-MD" data_files: - split: train path: "data/it-MD/*.parquet" - config_name: "it-ME" data_files: - split: train path: "data/it-ME/*.parquet" - config_name: "it-MK" data_files: - split: train path: "data/it-MK/*.parquet" - config_name: "it-ML" data_files: - split: train path: "data/it-ML/*.parquet" - config_name: "it-MT" data_files: - split: train path: "data/it-MT/*.parquet" - config_name: "it-MV" data_files: - split: train path: "data/it-MV/*.parquet" - config_name: "it-MX" data_files: - split: train path: "data/it-MX/*.parquet" - config_name: "it-NG" data_files: - split: train path: "data/it-NG/*.parquet" - config_name: "it-NL" data_files: - split: train path: "data/it-NL/*.parquet" - config_name: "it-NO" data_files: - split: train path: "data/it-NO/*.parquet" - config_name: "it-NP" data_files: - split: train path: "data/it-NP/*.parquet" - config_name: "it-NU" data_files: - split: train path: "data/it-NU/*.parquet" - config_name: "it-NZ" data_files: - split: train path: "data/it-NZ/*.parquet" - config_name: "it-PE" data_files: - split: train path: "data/it-PE/*.parquet" - config_name: "it-PH" data_files: - split: train path: "data/it-PH/*.parquet" - config_name: "it-PK" data_files: - split: train path: "data/it-PK/*.parquet" - config_name: "it-PL" data_files: - split: train path: "data/it-PL/*.parquet" - config_name: "it-PT" data_files: - split: train path: "data/it-PT/*.parquet" - config_name: "it-PU" data_files: - split: train path: "data/it-PU/*.parquet" - config_name: "it-PW" data_files: - split: train path: "data/it-PW/*.parquet" - config_name: "it-PY" data_files: - split: train path: "data/it-PY/*.parquet" - config_name: "it-QA" data_files: - split: train path: "data/it-QA/*.parquet" - config_name: "it-RE" data_files: - split: train path: 
"data/it-RE/*.parquet" - config_name: "it-RO" data_files: - split: train path: "data/it-RO/*.parquet" - config_name: "it-RS" data_files: - split: train path: "data/it-RS/*.parquet" - config_name: "it-RU" data_files: - split: train path: "data/it-RU/*.parquet" - config_name: "it-SA" data_files: - split: train path: "data/it-SA/*.parquet" - config_name: "it-SE" data_files: - split: train path: "data/it-SE/*.parquet" - config_name: "it-SG" data_files: - split: train path: "data/it-SG/*.parquet" - config_name: "it-SI" data_files: - split: train path: "data/it-SI/*.parquet" - config_name: "it-SK" data_files: - split: train path: "data/it-SK/*.parquet" - config_name: "it-SM" data_files: - split: train path: "data/it-SM/*.parquet" - config_name: "it-SO" data_files: - split: train path: "data/it-SO/*.parquet" - config_name: "it-ST" data_files: - split: train path: "data/it-ST/*.parquet" - config_name: "it-SV" data_files: - split: train path: "data/it-SV/*.parquet" - config_name: "it-TH" data_files: - split: train path: "data/it-TH/*.parquet" - config_name: "it-TK" data_files: - split: train path: "data/it-TK/*.parquet" - config_name: "it-TL" data_files: - split: train path: "data/it-TL/*.parquet" - config_name: "it-TN" data_files: - split: train path: "data/it-TN/*.parquet" - config_name: "it-TO" data_files: - split: train path: "data/it-TO/*.parquet" - config_name: "it-TR" data_files: - split: train path: "data/it-TR/*.parquet" - config_name: "it-TT" data_files: - split: train path: "data/it-TT/*.parquet" - config_name: "it-TV" data_files: - split: train path: "data/it-TV/*.parquet" - config_name: "it-TW" data_files: - split: train path: "data/it-TW/*.parquet" - config_name: "it-TZ" data_files: - split: train path: "data/it-TZ/*.parquet" - config_name: "it-UA" data_files: - split: train path: "data/it-UA/*.parquet" - config_name: "it-UK" data_files: - split: train path: "data/it-UK/*.parquet" - config_name: "it-UM" data_files: - split: train path: "data/it-UM/*.parquet" - 
config_name: "it-US" data_files: - split: train path: "data/it-US/*.parquet" - config_name: "it-VA" data_files: - split: train path: "data/it-VA/*.parquet" - config_name: "it-VE" data_files: - split: train path: "data/it-VE/*.parquet" - config_name: "it-VN" data_files: - split: train path: "data/it-VN/*.parquet" - config_name: "it-WS" data_files: - split: train path: "data/it-WS/*.parquet" - config_name: "it-WW" data_files: - split: train path: "data/it-WW/*.parquet" - config_name: "it-XX" data_files: - split: train path: "data/it-XX/*.parquet" - config_name: "it-ZA" data_files: - split: train path: "data/it-ZA/*.parquet" - config_name: "ja-AE" data_files: - split: train path: "data/ja-AE/*.parquet" - config_name: "ja-AF" data_files: - split: train path: "data/ja-AF/*.parquet" - config_name: "ja-AI" data_files: - split: train path: "data/ja-AI/*.parquet" - config_name: "ja-AL" data_files: - split: train path: "data/ja-AL/*.parquet" - config_name: "ja-AM" data_files: - split: train path: "data/ja-AM/*.parquet" - config_name: "ja-AR" data_files: - split: train path: "data/ja-AR/*.parquet" - config_name: "ja-AS" data_files: - split: train path: "data/ja-AS/*.parquet" - config_name: "ja-AT" data_files: - split: train path: "data/ja-AT/*.parquet" - config_name: "ja-AU" data_files: - split: train path: "data/ja-AU/*.parquet" - config_name: "ja-AZ" data_files: - split: train path: "data/ja-AZ/*.parquet" - config_name: "ja-BA" data_files: - split: train path: "data/ja-BA/*.parquet" - config_name: "ja-BD" data_files: - split: train path: "data/ja-BD/*.parquet" - config_name: "ja-BE" data_files: - split: train path: "data/ja-BE/*.parquet" - config_name: "ja-BF" data_files: - split: train path: "data/ja-BF/*.parquet" - config_name: "ja-BG" data_files: - split: train path: "data/ja-BG/*.parquet" - config_name: "ja-BO" data_files: - split: train path: "data/ja-BO/*.parquet" - config_name: "ja-BR" data_files: - split: train path: "data/ja-BR/*.parquet" - config_name: "ja-BY" 
data_files: - split: train path: "data/ja-BY/*.parquet" - config_name: "ja-BZ" data_files: - split: train path: "data/ja-BZ/*.parquet" - config_name: "ja-CA" data_files: - split: train path: "data/ja-CA/*.parquet" - config_name: "ja-CC" data_files: - split: train path: "data/ja-CC/*.parquet" - config_name: "ja-CD" data_files: - split: train path: "data/ja-CD/*.parquet" - config_name: "ja-CF" data_files: - split: train path: "data/ja-CF/*.parquet" - config_name: "ja-CH" data_files: - split: train path: "data/ja-CH/*.parquet" - config_name: "ja-CL" data_files: - split: train path: "data/ja-CL/*.parquet" - config_name: "ja-CM" data_files: - split: train path: "data/ja-CM/*.parquet" - config_name: "ja-CN" data_files: - split: train path: "data/ja-CN/*.parquet" - config_name: "ja-CO" data_files: - split: train path: "data/ja-CO/*.parquet" - config_name: "ja-CR" data_files: - split: train path: "data/ja-CR/*.parquet" - config_name: "ja-CX" data_files: - split: train path: "data/ja-CX/*.parquet" - config_name: "ja-CZ" data_files: - split: train path: "data/ja-CZ/*.parquet" - config_name: "ja-DE" data_files: - split: train path: "data/ja-DE/*.parquet" - config_name: "ja-DK" data_files: - split: train path: "data/ja-DK/*.parquet" - config_name: "ja-DO" data_files: - split: train path: "data/ja-DO/*.parquet" - config_name: "ja-DZ" data_files: - split: train path: "data/ja-DZ/*.parquet" - config_name: "ja-EC" data_files: - split: train path: "data/ja-EC/*.parquet" - config_name: "ja-EE" data_files: - split: train path: "data/ja-EE/*.parquet" - config_name: "ja-EG" data_files: - split: train path: "data/ja-EG/*.parquet" - config_name: "ja-EN" data_files: - split: train path: "data/ja-EN/*.parquet" - config_name: "ja-ES" data_files: - split: train path: "data/ja-ES/*.parquet" - config_name: "ja-EU" data_files: - split: train path: "data/ja-EU/*.parquet" - config_name: "ja-FI" data_files: - split: train path: "data/ja-FI/*.parquet" - config_name: "ja-FJ" data_files: - split: 
train path: "data/ja-FJ/*.parquet" - config_name: "ja-FM" data_files: - split: train path: "data/ja-FM/*.parquet" - config_name: "ja-FR" data_files: - split: train path: "data/ja-FR/*.parquet" - config_name: "ja-GA" data_files: - split: train path: "data/ja-GA/*.parquet" - config_name: "ja-GB" data_files: - split: train path: "data/ja-GB/*.parquet" - config_name: "ja-GE" data_files: - split: train path: "data/ja-GE/*.parquet" - config_name: "ja-GG" data_files: - split: train path: "data/ja-GG/*.parquet" - config_name: "ja-GH" data_files: - split: train path: "data/ja-GH/*.parquet" - config_name: "ja-GQ" data_files: - split: train path: "data/ja-GQ/*.parquet" - config_name: "ja-GR" data_files: - split: train path: "data/ja-GR/*.parquet" - config_name: "ja-GT" data_files: - split: train path: "data/ja-GT/*.parquet" - config_name: "ja-HK" data_files: - split: train path: "data/ja-HK/*.parquet" - config_name: "ja-HR" data_files: - split: train path: "data/ja-HR/*.parquet" - config_name: "ja-HU" data_files: - split: train path: "data/ja-HU/*.parquet" - config_name: "ja-ID" data_files: - split: train path: "data/ja-ID/*.parquet" - config_name: "ja-IE" data_files: - split: train path: "data/ja-IE/*.parquet" - config_name: "ja-IL" data_files: - split: train path: "data/ja-IL/*.parquet" - config_name: "ja-IN" data_files: - split: train path: "data/ja-IN/*.parquet" - config_name: "ja-IO" data_files: - split: train path: "data/ja-IO/*.parquet" - config_name: "ja-IR" data_files: - split: train path: "data/ja-IR/*.parquet" - config_name: "ja-IS" data_files: - split: train path: "data/ja-IS/*.parquet" - config_name: "ja-IT" data_files: - split: train path: "data/ja-IT/*.parquet" - config_name: "ja-JA" data_files: - split: train path: "data/ja-JA/*.parquet" - config_name: "ja-JO" data_files: - split: train path: "data/ja-JO/*.parquet" - config_name: "ja-JP" data_files: - split: train path: "data/ja-JP/*.parquet" - config_name: "ja-KE" data_files: - split: train path: 
"data/ja-KE/*.parquet" - config_name: "ja-KG" data_files: - split: train path: "data/ja-KG/*.parquet" - config_name: "ja-KR" data_files: - split: train path: "data/ja-KR/*.parquet" - config_name: "ja-KZ" data_files: - split: train path: "data/ja-KZ/*.parquet" - config_name: "ja-LA" data_files: - split: train path: "data/ja-LA/*.parquet" - config_name: "ja-LB" data_files: - split: train path: "data/ja-LB/*.parquet" - config_name: "ja-LI" data_files: - split: train path: "data/ja-LI/*.parquet" - config_name: "ja-LK" data_files: - split: train path: "data/ja-LK/*.parquet" - config_name: "ja-LT" data_files: - split: train path: "data/ja-LT/*.parquet" - config_name: "ja-LU" data_files: - split: train path: "data/ja-LU/*.parquet" - config_name: "ja-LV" data_files: - split: train path: "data/ja-LV/*.parquet" - config_name: "ja-LY" data_files: - split: train path: "data/ja-LY/*.parquet" - config_name: "ja-MA" data_files: - split: train path: "data/ja-MA/*.parquet" - config_name: "ja-MD" data_files: - split: train path: "data/ja-MD/*.parquet" - config_name: "ja-ME" data_files: - split: train path: "data/ja-ME/*.parquet" - config_name: "ja-MG" data_files: - split: train path: "data/ja-MG/*.parquet" - config_name: "ja-MK" data_files: - split: train path: "data/ja-MK/*.parquet" - config_name: "ja-ML" data_files: - split: train path: "data/ja-ML/*.parquet" - config_name: "ja-MN" data_files: - split: train path: "data/ja-MN/*.parquet" - config_name: "ja-MQ" data_files: - split: train path: "data/ja-MQ/*.parquet" - config_name: "ja-MR" data_files: - split: train path: "data/ja-MR/*.parquet" - config_name: "ja-MS" data_files: - split: train path: "data/ja-MS/*.parquet" - config_name: "ja-MT" data_files: - split: train path: "data/ja-MT/*.parquet" - config_name: "ja-MU" data_files: - split: train path: "data/ja-MU/*.parquet" - config_name: "ja-MX" data_files: - split: train path: "data/ja-MX/*.parquet" - config_name: "ja-MY" data_files: - split: train path: "data/ja-MY/*.parquet" - 
config_name: "ja-MZ" data_files: - split: train path: "data/ja-MZ/*.parquet" - config_name: "ja-NF" data_files: - split: train path: "data/ja-NF/*.parquet" - config_name: "ja-NG" data_files: - split: train path: "data/ja-NG/*.parquet" - config_name: "ja-NL" data_files: - split: train path: "data/ja-NL/*.parquet" - config_name: "ja-NO" data_files: - split: train path: "data/ja-NO/*.parquet" - config_name: "ja-NP" data_files: - split: train path: "data/ja-NP/*.parquet" - config_name: "ja-NU" data_files: - split: train path: "data/ja-NU/*.parquet" - config_name: "ja-NZ" data_files: - split: train path: "data/ja-NZ/*.parquet" - config_name: "ja-PE" data_files: - split: train path: "data/ja-PE/*.parquet" - config_name: "ja-PH" data_files: - split: train path: "data/ja-PH/*.parquet" - config_name: "ja-PK" data_files: - split: train path: "data/ja-PK/*.parquet" - config_name: "ja-PL" data_files: - split: train path: "data/ja-PL/*.parquet" - config_name: "ja-PM" data_files: - split: train path: "data/ja-PM/*.parquet" - config_name: "ja-PS" data_files: - split: train path: "data/ja-PS/*.parquet" - config_name: "ja-PT" data_files: - split: train path: "data/ja-PT/*.parquet" - config_name: "ja-PW" data_files: - split: train path: "data/ja-PW/*.parquet" - config_name: "ja-PY" data_files: - split: train path: "data/ja-PY/*.parquet" - config_name: "ja-RE" data_files: - split: train path: "data/ja-RE/*.parquet" - config_name: "ja-RO" data_files: - split: train path: "data/ja-RO/*.parquet" - config_name: "ja-RS" data_files: - split: train path: "data/ja-RS/*.parquet" - config_name: "ja-RU" data_files: - split: train path: "data/ja-RU/*.parquet" - config_name: "ja-RW" data_files: - split: train path: "data/ja-RW/*.parquet" - config_name: "ja-SA" data_files: - split: train path: "data/ja-SA/*.parquet" - config_name: "ja-SC" data_files: - split: train path: "data/ja-SC/*.parquet" - config_name: "ja-SE" data_files: - split: train path: "data/ja-SE/*.parquet" - config_name: "ja-SG" 
data_files: - split: train path: "data/ja-SG/*.parquet" - config_name: "ja-SH" data_files: - split: train path: "data/ja-SH/*.parquet" - config_name: "ja-SI" data_files: - split: train path: "data/ja-SI/*.parquet" - config_name: "ja-SK" data_files: - split: train path: "data/ja-SK/*.parquet" - config_name: "ja-SN" data_files: - split: train path: "data/ja-SN/*.parquet" - config_name: "ja-SO" data_files: - split: train path: "data/ja-SO/*.parquet" - config_name: "ja-ST" data_files: - split: train path: "data/ja-ST/*.parquet" - config_name: "ja-SZ" data_files: - split: train path: "data/ja-SZ/*.parquet" - config_name: "ja-TC" data_files: - split: train path: "data/ja-TC/*.parquet" - config_name: "ja-TG" data_files: - split: train path: "data/ja-TG/*.parquet" - config_name: "ja-TH" data_files: - split: train path: "data/ja-TH/*.parquet" - config_name: "ja-TJ" data_files: - split: train path: "data/ja-TJ/*.parquet" - config_name: "ja-TK" data_files: - split: train path: "data/ja-TK/*.parquet" - config_name: "ja-TO" data_files: - split: train path: "data/ja-TO/*.parquet" - config_name: "ja-TR" data_files: - split: train path: "data/ja-TR/*.parquet" - config_name: "ja-TV" data_files: - split: train path: "data/ja-TV/*.parquet" - config_name: "ja-TW" data_files: - split: train path: "data/ja-TW/*.parquet" - config_name: "ja-TZ" data_files: - split: train path: "data/ja-TZ/*.parquet" - config_name: "ja-UA" data_files: - split: train path: "data/ja-UA/*.parquet" - config_name: "ja-UG" data_files: - split: train path: "data/ja-UG/*.parquet" - config_name: "ja-US" data_files: - split: train path: "data/ja-US/*.parquet" - config_name: "ja-UY" data_files: - split: train path: "data/ja-UY/*.parquet" - config_name: "ja-UZ" data_files: - split: train path: "data/ja-UZ/*.parquet" - config_name: "ja-VC" data_files: - split: train path: "data/ja-VC/*.parquet" - config_name: "ja-VE" data_files: - split: train path: "data/ja-VE/*.parquet" - config_name: "ja-VG" data_files: - split: 
train path: "data/ja-VG/*.parquet" - config_name: "ja-VN" data_files: - split: train path: "data/ja-VN/*.parquet" - config_name: "ja-WS" data_files: - split: train path: "data/ja-WS/*.parquet" - config_name: "ja-XX" data_files: - split: train path: "data/ja-XX/*.parquet" - config_name: "ja-ZA" data_files: - split: train path: "data/ja-ZA/*.parquet" - config_name: "ja-ZM" data_files: - split: train path: "data/ja-ZM/*.parquet" - config_name: "ja-ZW" data_files: - split: train path: "data/ja-ZW/*.parquet" - config_name: "jbo-XX" data_files: - split: train path: "data/jbo-XX/*.parquet" - config_name: "jv-XX" data_files: - split: train path: "data/jv-XX/*.parquet" - config_name: "ka-AM" data_files: - split: train path: "data/ka-AM/*.parquet" - config_name: "ka-AZ" data_files: - split: train path: "data/ka-AZ/*.parquet" - config_name: "ka-CC" data_files: - split: train path: "data/ka-CC/*.parquet" - config_name: "ka-CO" data_files: - split: train path: "data/ka-CO/*.parquet" - config_name: "ka-DE" data_files: - split: train path: "data/ka-DE/*.parquet" - config_name: "ka-EU" data_files: - split: train path: "data/ka-EU/*.parquet" - config_name: "ka-GB" data_files: - split: train path: "data/ka-GB/*.parquet" - config_name: "ka-GE" data_files: - split: train path: "data/ka-GE/*.parquet" - config_name: "ka-IN" data_files: - split: train path: "data/ka-IN/*.parquet" - config_name: "ka-IO" data_files: - split: train path: "data/ka-IO/*.parquet" - config_name: "ka-IR" data_files: - split: train path: "data/ka-IR/*.parquet" - config_name: "ka-JP" data_files: - split: train path: "data/ka-JP/*.parquet" - config_name: "ka-KA" data_files: - split: train path: "data/ka-KA/*.parquet" - config_name: "ka-KZ" data_files: - split: train path: "data/ka-KZ/*.parquet" - config_name: "ka-ME" data_files: - split: train path: "data/ka-ME/*.parquet" - config_name: "ka-RS" data_files: - split: train path: "data/ka-RS/*.parquet" - config_name: "ka-RU" data_files: - split: train path: 
"data/ka-RU/*.parquet" - config_name: "ka-SK" data_files: - split: train path: "data/ka-SK/*.parquet" - config_name: "ka-TO" data_files: - split: train path: "data/ka-TO/*.parquet" - config_name: "ka-TR" data_files: - split: train path: "data/ka-TR/*.parquet" - config_name: "ka-TV" data_files: - split: train path: "data/ka-TV/*.parquet" - config_name: "ka-UA" data_files: - split: train path: "data/ka-UA/*.parquet" - config_name: "ka-US" data_files: - split: train path: "data/ka-US/*.parquet" - config_name: "ka-WS" data_files: - split: train path: "data/ka-WS/*.parquet" - config_name: "ka-XX" data_files: - split: train path: "data/ka-XX/*.parquet" - config_name: "kk-AI" data_files: - split: train path: "data/kk-AI/*.parquet" - config_name: "kk-BR" data_files: - split: train path: "data/kk-BR/*.parquet" - config_name: "kk-CN" data_files: - split: train path: "data/kk-CN/*.parquet" - config_name: "kk-DE" data_files: - split: train path: "data/kk-DE/*.parquet" - config_name: "kk-EU" data_files: - split: train path: "data/kk-EU/*.parquet" - config_name: "kk-FM" data_files: - split: train path: "data/kk-FM/*.parquet" - config_name: "kk-FR" data_files: - split: train path: "data/kk-FR/*.parquet" - config_name: "kk-GB" data_files: - split: train path: "data/kk-GB/*.parquet" - config_name: "kk-HK" data_files: - split: train path: "data/kk-HK/*.parquet" - config_name: "kk-IN" data_files: - split: train path: "data/kk-IN/*.parquet" - config_name: "kk-KG" data_files: - split: train path: "data/kk-KG/*.parquet" - config_name: "kk-KK" data_files: - split: train path: "data/kk-KK/*.parquet" - config_name: "kk-KZ" data_files: - split: train path: "data/kk-KZ/*.parquet" - config_name: "kk-ME" data_files: - split: train path: "data/kk-ME/*.parquet" - config_name: "kk-PE" data_files: - split: train path: "data/kk-PE/*.parquet" - config_name: "kk-QR" data_files: - split: train path: "data/kk-QR/*.parquet" - config_name: "kk-RU" data_files: - split: train path: "data/kk-RU/*.parquet" - 
config_name: "kk-SE" data_files: - split: train path: "data/kk-SE/*.parquet" - config_name: "kk-SG" data_files: - split: train path: "data/kk-SG/*.parquet" - config_name: "kk-TR" data_files: - split: train path: "data/kk-TR/*.parquet" - config_name: "kk-TV" data_files: - split: train path: "data/kk-TV/*.parquet" - config_name: "kk-UA" data_files: - split: train path: "data/kk-UA/*.parquet" - config_name: "kk-US" data_files: - split: train path: "data/kk-US/*.parquet" - config_name: "kk-UZ" data_files: - split: train path: "data/kk-UZ/*.parquet" - config_name: "kk-WS" data_files: - split: train path: "data/kk-WS/*.parquet" - config_name: "kk-XX" data_files: - split: train path: "data/kk-XX/*.parquet" - config_name: "km-AU" data_files: - split: train path: "data/km-AU/*.parquet" - config_name: "km-CN" data_files: - split: train path: "data/km-CN/*.parquet" - config_name: "km-ES" data_files: - split: train path: "data/km-ES/*.parquet" - config_name: "km-FR" data_files: - split: train path: "data/km-FR/*.parquet" - config_name: "km-KH" data_files: - split: train path: "data/km-KH/*.parquet" - config_name: "km-TV" data_files: - split: train path: "data/km-TV/*.parquet" - config_name: "km-US" data_files: - split: train path: "data/km-US/*.parquet" - config_name: "km-VN" data_files: - split: train path: "data/km-VN/*.parquet" - config_name: "km-XX" data_files: - split: train path: "data/km-XX/*.parquet" - config_name: "kn-GB" data_files: - split: train path: "data/kn-GB/*.parquet" - config_name: "kn-IN" data_files: - split: train path: "data/kn-IN/*.parquet" - config_name: "kn-RU" data_files: - split: train path: "data/kn-RU/*.parquet" - config_name: "kn-US" data_files: - split: train path: "data/kn-US/*.parquet" - config_name: "kn-XX" data_files: - split: train path: "data/kn-XX/*.parquet" - config_name: "ko-AE" data_files: - split: train path: "data/ko-AE/*.parquet" - config_name: "ko-AI" data_files: - split: train path: "data/ko-AI/*.parquet" - config_name: "ko-AM" 
data_files: - split: train path: "data/ko-AM/*.parquet" - config_name: "ko-AR" data_files: - split: train path: "data/ko-AR/*.parquet" - config_name: "ko-AT" data_files: - split: train path: "data/ko-AT/*.parquet" - config_name: "ko-AU" data_files: - split: train path: "data/ko-AU/*.parquet" - config_name: "ko-BE" data_files: - split: train path: "data/ko-BE/*.parquet" - config_name: "ko-BG" data_files: - split: train path: "data/ko-BG/*.parquet" - config_name: "ko-BO" data_files: - split: train path: "data/ko-BO/*.parquet" - config_name: "ko-BR" data_files: - split: train path: "data/ko-BR/*.parquet" - config_name: "ko-BZ" data_files: - split: train path: "data/ko-BZ/*.parquet" - config_name: "ko-CA" data_files: - split: train path: "data/ko-CA/*.parquet" - config_name: "ko-CC" data_files: - split: train path: "data/ko-CC/*.parquet" - config_name: "ko-CD" data_files: - split: train path: "data/ko-CD/*.parquet" - config_name: "ko-CH" data_files: - split: train path: "data/ko-CH/*.parquet" - config_name: "ko-CL" data_files: - split: train path: "data/ko-CL/*.parquet" - config_name: "ko-CN" data_files: - split: train path: "data/ko-CN/*.parquet" - config_name: "ko-CO" data_files: - split: train path: "data/ko-CO/*.parquet" - config_name: "ko-CZ" data_files: - split: train path: "data/ko-CZ/*.parquet" - config_name: "ko-DE" data_files: - split: train path: "data/ko-DE/*.parquet" - config_name: "ko-DK" data_files: - split: train path: "data/ko-DK/*.parquet" - config_name: "ko-DO" data_files: - split: train path: "data/ko-DO/*.parquet" - config_name: "ko-EC" data_files: - split: train path: "data/ko-EC/*.parquet" - config_name: "ko-EN" data_files: - split: train path: "data/ko-EN/*.parquet" - config_name: "ko-ES" data_files: - split: train path: "data/ko-ES/*.parquet" - config_name: "ko-EU" data_files: - split: train path: "data/ko-EU/*.parquet" - config_name: "ko-FI" data_files: - split: train path: "data/ko-FI/*.parquet" - config_name: "ko-FM" data_files: - split: 
train path: "data/ko-FM/*.parquet" - config_name: "ko-FR" data_files: - split: train path: "data/ko-FR/*.parquet" - config_name: "ko-GB" data_files: - split: train path: "data/ko-GB/*.parquet" - config_name: "ko-GR" data_files: - split: train path: "data/ko-GR/*.parquet" - config_name: "ko-HK" data_files: - split: train path: "data/ko-HK/*.parquet" - config_name: "ko-HR" data_files: - split: train path: "data/ko-HR/*.parquet" - config_name: "ko-HU" data_files: - split: train path: "data/ko-HU/*.parquet" - config_name: "ko-ID" data_files: - split: train path: "data/ko-ID/*.parquet" - config_name: "ko-IE" data_files: - split: train path: "data/ko-IE/*.parquet" - config_name: "ko-IM" data_files: - split: train path: "data/ko-IM/*.parquet" - config_name: "ko-IN" data_files: - split: train path: "data/ko-IN/*.parquet" - config_name: "ko-IO" data_files: - split: train path: "data/ko-IO/*.parquet" - config_name: "ko-IR" data_files: - split: train path: "data/ko-IR/*.parquet" - config_name: "ko-IS" data_files: - split: train path: "data/ko-IS/*.parquet" - config_name: "ko-IT" data_files: - split: train path: "data/ko-IT/*.parquet" - config_name: "ko-JP" data_files: - split: train path: "data/ko-JP/*.parquet" - config_name: "ko-KH" data_files: - split: train path: "data/ko-KH/*.parquet" - config_name: "ko-KO" data_files: - split: train path: "data/ko-KO/*.parquet" - config_name: "ko-KP" data_files: - split: train path: "data/ko-KP/*.parquet" - config_name: "ko-KR" data_files: - split: train path: "data/ko-KR/*.parquet" - config_name: "ko-KY" data_files: - split: train path: "data/ko-KY/*.parquet" - config_name: "ko-LY" data_files: - split: train path: "data/ko-LY/*.parquet" - config_name: "ko-MA" data_files: - split: train path: "data/ko-MA/*.parquet" - config_name: "ko-ME" data_files: - split: train path: "data/ko-ME/*.parquet" - config_name: "ko-MO" data_files: - split: train path: "data/ko-MO/*.parquet" - config_name: "ko-MX" data_files: - split: train path: 
"data/ko-MX/*.parquet" - config_name: "ko-MY" data_files: - split: train path: "data/ko-MY/*.parquet" - config_name: "ko-NG" data_files: - split: train path: "data/ko-NG/*.parquet" - config_name: "ko-NL" data_files: - split: train path: "data/ko-NL/*.parquet" - config_name: "ko-NO" data_files: - split: train path: "data/ko-NO/*.parquet" - config_name: "ko-NZ" data_files: - split: train path: "data/ko-NZ/*.parquet" - config_name: "ko-PE" data_files: - split: train path: "data/ko-PE/*.parquet" - config_name: "ko-PH" data_files: - split: train path: "data/ko-PH/*.parquet" - config_name: "ko-PK" data_files: - split: train path: "data/ko-PK/*.parquet" - config_name: "ko-PL" data_files: - split: train path: "data/ko-PL/*.parquet" - config_name: "ko-PT" data_files: - split: train path: "data/ko-PT/*.parquet" - config_name: "ko-PW" data_files: - split: train path: "data/ko-PW/*.parquet" - config_name: "ko-PY" data_files: - split: train path: "data/ko-PY/*.parquet" - config_name: "ko-RE" data_files: - split: train path: "data/ko-RE/*.parquet" - config_name: "ko-RO" data_files: - split: train path: "data/ko-RO/*.parquet" - config_name: "ko-RS" data_files: - split: train path: "data/ko-RS/*.parquet" - config_name: "ko-RU" data_files: - split: train path: "data/ko-RU/*.parquet" - config_name: "ko-SE" data_files: - split: train path: "data/ko-SE/*.parquet" - config_name: "ko-SG" data_files: - split: train path: "data/ko-SG/*.parquet" - config_name: "ko-SH" data_files: - split: train path: "data/ko-SH/*.parquet" - config_name: "ko-SI" data_files: - split: train path: "data/ko-SI/*.parquet" - config_name: "ko-SO" data_files: - split: train path: "data/ko-SO/*.parquet" - config_name: "ko-TC" data_files: - split: train path: "data/ko-TC/*.parquet" - config_name: "ko-TH" data_files: - split: train path: "data/ko-TH/*.parquet" - config_name: "ko-TK" data_files: - split: train path: "data/ko-TK/*.parquet" - config_name: "ko-TO" data_files: - split: train path: "data/ko-TO/*.parquet" - 
config_name: "ko-TR" data_files: - split: train path: "data/ko-TR/*.parquet" - config_name: "ko-TV" data_files: - split: train path: "data/ko-TV/*.parquet" - config_name: "ko-TW" data_files: - split: train path: "data/ko-TW/*.parquet" - config_name: "ko-UA" data_files: - split: train path: "data/ko-UA/*.parquet" - config_name: "ko-US" data_files: - split: train path: "data/ko-US/*.parquet" - config_name: "ko-VA" data_files: - split: train path: "data/ko-VA/*.parquet" - config_name: "ko-VN" data_files: - split: train path: "data/ko-VN/*.parquet" - config_name: "ko-WS" data_files: - split: train path: "data/ko-WS/*.parquet" - config_name: "ko-WW" data_files: - split: train path: "data/ko-WW/*.parquet" - config_name: "ko-XX" data_files: - split: train path: "data/ko-XX/*.parquet" - config_name: "krc-RU" data_files: - split: train path: "data/krc-RU/*.parquet" - config_name: "krc-XX" data_files: - split: train path: "data/krc-XX/*.parquet" - config_name: "ku-BE" data_files: - split: train path: "data/ku-BE/*.parquet" - config_name: "ku-DE" data_files: - split: train path: "data/ku-DE/*.parquet" - config_name: "ku-DK" data_files: - split: train path: "data/ku-DK/*.parquet" - config_name: "ku-GB" data_files: - split: train path: "data/ku-GB/*.parquet" - config_name: "ku-KR" data_files: - split: train path: "data/ku-KR/*.parquet" - config_name: "ku-KU" data_files: - split: train path: "data/ku-KU/*.parquet" - config_name: "ku-NL" data_files: - split: train path: "data/ku-NL/*.parquet" - config_name: "ku-RU" data_files: - split: train path: "data/ku-RU/*.parquet" - config_name: "ku-SE" data_files: - split: train path: "data/ku-SE/*.parquet" - config_name: "ku-TR" data_files: - split: train path: "data/ku-TR/*.parquet" - config_name: "ku-TV" data_files: - split: train path: "data/ku-TV/*.parquet" - config_name: "ku-US" data_files: - split: train path: "data/ku-US/*.parquet" - config_name: "ku-XX" data_files: - split: train path: "data/ku-XX/*.parquet" - config_name: "kv-XX" 
data_files: - split: train path: "data/kv-XX/*.parquet" - config_name: "kw-IM" data_files: - split: train path: "data/kw-IM/*.parquet" - config_name: "kw-XX" data_files: - split: train path: "data/kw-XX/*.parquet" - config_name: "ky-BG" data_files: - split: train path: "data/ky-BG/*.parquet" - config_name: "ky-EU" data_files: - split: train path: "data/ky-EU/*.parquet" - config_name: "ky-FR" data_files: - split: train path: "data/ky-FR/*.parquet" - config_name: "ky-GB" data_files: - split: train path: "data/ky-GB/*.parquet" - config_name: "ky-IO" data_files: - split: train path: "data/ky-IO/*.parquet" - config_name: "ky-KG" data_files: - split: train path: "data/ky-KG/*.parquet" - config_name: "ky-KY" data_files: - split: train path: "data/ky-KY/*.parquet" - config_name: "ky-KZ" data_files: - split: train path: "data/ky-KZ/*.parquet" - config_name: "ky-RU" data_files: - split: train path: "data/ky-RU/*.parquet" - config_name: "ky-TV" data_files: - split: train path: "data/ky-TV/*.parquet" - config_name: "ky-US" data_files: - split: train path: "data/ky-US/*.parquet" - config_name: "ky-UZ" data_files: - split: train path: "data/ky-UZ/*.parquet" - config_name: "ky-XX" data_files: - split: train path: "data/ky-XX/*.parquet" - config_name: "la-AR" data_files: - split: train path: "data/la-AR/*.parquet" - config_name: "la-AU" data_files: - split: train path: "data/la-AU/*.parquet" - config_name: "la-BE" data_files: - split: train path: "data/la-BE/*.parquet" - config_name: "la-BR" data_files: - split: train path: "data/la-BR/*.parquet" - config_name: "la-CH" data_files: - split: train path: "data/la-CH/*.parquet" - config_name: "la-CN" data_files: - split: train path: "data/la-CN/*.parquet" - config_name: "la-CO" data_files: - split: train path: "data/la-CO/*.parquet" - config_name: "la-DE" data_files: - split: train path: "data/la-DE/*.parquet" - config_name: "la-DK" data_files: - split: train path: "data/la-DK/*.parquet" - config_name: "la-ES" data_files: - split: 
train path: "data/la-ES/*.parquet" - config_name: "la-FI" data_files: - split: train path: "data/la-FI/*.parquet" - config_name: "la-FR" data_files: - split: train path: "data/la-FR/*.parquet" - config_name: "la-GB" data_files: - split: train path: "data/la-GB/*.parquet" - config_name: "la-ID" data_files: - split: train path: "data/la-ID/*.parquet" - config_name: "la-IO" data_files: - split: train path: "data/la-IO/*.parquet" - config_name: "la-IS" data_files: - split: train path: "data/la-IS/*.parquet" - config_name: "la-IT" data_files: - split: train path: "data/la-IT/*.parquet" - config_name: "la-LK" data_files: - split: train path: "data/la-LK/*.parquet" - config_name: "la-ME" data_files: - split: train path: "data/la-ME/*.parquet" - config_name: "la-MX" data_files: - split: train path: "data/la-MX/*.parquet" - config_name: "la-NL" data_files: - split: train path: "data/la-NL/*.parquet" - config_name: "la-SI" data_files: - split: train path: "data/la-SI/*.parquet" - config_name: "la-US" data_files: - split: train path: "data/la-US/*.parquet" - config_name: "la-VN" data_files: - split: train path: "data/la-VN/*.parquet" - config_name: "la-XX" data_files: - split: train path: "data/la-XX/*.parquet" - config_name: "lb-BE" data_files: - split: train path: "data/lb-BE/*.parquet" - config_name: "lb-DE" data_files: - split: train path: "data/lb-DE/*.parquet" - config_name: "lb-EU" data_files: - split: train path: "data/lb-EU/*.parquet" - config_name: "lb-FR" data_files: - split: train path: "data/lb-FR/*.parquet" - config_name: "lb-HR" data_files: - split: train path: "data/lb-HR/*.parquet" - config_name: "lb-IE" data_files: - split: train path: "data/lb-IE/*.parquet" - config_name: "lb-LU" data_files: - split: train path: "data/lb-LU/*.parquet" - config_name: "lb-US" data_files: - split: train path: "data/lb-US/*.parquet" - config_name: "lb-XX" data_files: - split: train path: "data/lb-XX/*.parquet" - config_name: "lez-RU" data_files: - split: train path: 
"data/lez-RU/*.parquet" - config_name: "lez-XX" data_files: - split: train path: "data/lez-XX/*.parquet" - config_name: "li-XX" data_files: - split: train path: "data/li-XX/*.parquet" - config_name: "lmo-XX" data_files: - split: train path: "data/lmo-XX/*.parquet" - config_name: "lo-CN" data_files: - split: train path: "data/lo-CN/*.parquet" - config_name: "lo-ES" data_files: - split: train path: "data/lo-ES/*.parquet" - config_name: "lo-GB" data_files: - split: train path: "data/lo-GB/*.parquet" - config_name: "lo-LA" data_files: - split: train path: "data/lo-LA/*.parquet" - config_name: "lo-SI" data_files: - split: train path: "data/lo-SI/*.parquet" - config_name: "lo-US" data_files: - split: train path: "data/lo-US/*.parquet" - config_name: "lo-VN" data_files: - split: train path: "data/lo-VN/*.parquet" - config_name: "lo-XX" data_files: - split: train path: "data/lo-XX/*.parquet" - config_name: "lt-AI" data_files: - split: train path: "data/lt-AI/*.parquet" - config_name: "lt-AM" data_files: - split: train path: "data/lt-AM/*.parquet" - config_name: "lt-AT" data_files: - split: train path: "data/lt-AT/*.parquet" - config_name: "lt-BM" data_files: - split: train path: "data/lt-BM/*.parquet" - config_name: "lt-BR" data_files: - split: train path: "data/lt-BR/*.parquet" - config_name: "lt-CA" data_files: - split: train path: "data/lt-CA/*.parquet" - config_name: "lt-CF" data_files: - split: train path: "data/lt-CF/*.parquet" - config_name: "lt-CH" data_files: - split: train path: "data/lt-CH/*.parquet" - config_name: "lt-CN" data_files: - split: train path: "data/lt-CN/*.parquet" - config_name: "lt-CO" data_files: - split: train path: "data/lt-CO/*.parquet" - config_name: "lt-CZ" data_files: - split: train path: "data/lt-CZ/*.parquet" - config_name: "lt-DE" data_files: - split: train path: "data/lt-DE/*.parquet" - config_name: "lt-DK" data_files: - split: train path: "data/lt-DK/*.parquet" - config_name: "lt-EE" data_files: - split: train path: 
"data/lt-EE/*.parquet" - config_name: "lt-EN" data_files: - split: train path: "data/lt-EN/*.parquet" - config_name: "lt-ES" data_files: - split: train path: "data/lt-ES/*.parquet" - config_name: "lt-EU" data_files: - split: train path: "data/lt-EU/*.parquet" - config_name: "lt-FI" data_files: - split: train path: "data/lt-FI/*.parquet" - config_name: "lt-FM" data_files: - split: train path: "data/lt-FM/*.parquet" - config_name: "lt-FR" data_files: - split: train path: "data/lt-FR/*.parquet" - config_name: "lt-GB" data_files: - split: train path: "data/lt-GB/*.parquet" - config_name: "lt-IN" data_files: - split: train path: "data/lt-IN/*.parquet" - config_name: "lt-IO" data_files: - split: train path: "data/lt-IO/*.parquet" - config_name: "lt-IT" data_files: - split: train path: "data/lt-IT/*.parquet" - config_name: "lt-JP" data_files: - split: train path: "data/lt-JP/*.parquet" - config_name: "lt-LT" data_files: - split: train path: "data/lt-LT/*.parquet" - config_name: "lt-LU" data_files: - split: train path: "data/lt-LU/*.parquet" - config_name: "lt-LV" data_files: - split: train path: "data/lt-LV/*.parquet" - config_name: "lt-NL" data_files: - split: train path: "data/lt-NL/*.parquet" - config_name: "lt-NO" data_files: - split: train path: "data/lt-NO/*.parquet" - config_name: "lt-PL" data_files: - split: train path: "data/lt-PL/*.parquet" - config_name: "lt-PT" data_files: - split: train path: "data/lt-PT/*.parquet" - config_name: "lt-RO" data_files: - split: train path: "data/lt-RO/*.parquet" - config_name: "lt-RU" data_files: - split: train path: "data/lt-RU/*.parquet" - config_name: "lt-SE" data_files: - split: train path: "data/lt-SE/*.parquet" - config_name: "lt-SI" data_files: - split: train path: "data/lt-SI/*.parquet" - config_name: "lt-TV" data_files: - split: train path: "data/lt-TV/*.parquet" - config_name: "lt-UA" data_files: - split: train path: "data/lt-UA/*.parquet" - config_name: "lt-US" data_files: - split: train path: "data/lt-US/*.parquet" - 
config_name: "lt-VA" data_files: - split: train path: "data/lt-VA/*.parquet" - config_name: "lt-XX" data_files: - split: train path: "data/lt-XX/*.parquet" - config_name: "lv-AI" data_files: - split: train path: "data/lv-AI/*.parquet" - config_name: "lv-AM" data_files: - split: train path: "data/lv-AM/*.parquet" - config_name: "lv-AU" data_files: - split: train path: "data/lv-AU/*.parquet" - config_name: "lv-BE" data_files: - split: train path: "data/lv-BE/*.parquet" - config_name: "lv-CA" data_files: - split: train path: "data/lv-CA/*.parquet" - config_name: "lv-CH" data_files: - split: train path: "data/lv-CH/*.parquet" - config_name: "lv-CO" data_files: - split: train path: "data/lv-CO/*.parquet" - config_name: "lv-DE" data_files: - split: train path: "data/lv-DE/*.parquet" - config_name: "lv-EE" data_files: - split: train path: "data/lv-EE/*.parquet" - config_name: "lv-EU" data_files: - split: train path: "data/lv-EU/*.parquet" - config_name: "lv-FI" data_files: - split: train path: "data/lv-FI/*.parquet" - config_name: "lv-FM" data_files: - split: train path: "data/lv-FM/*.parquet" - config_name: "lv-FR" data_files: - split: train path: "data/lv-FR/*.parquet" - config_name: "lv-GB" data_files: - split: train path: "data/lv-GB/*.parquet" - config_name: "lv-IE" data_files: - split: train path: "data/lv-IE/*.parquet" - config_name: "lv-IO" data_files: - split: train path: "data/lv-IO/*.parquet" - config_name: "lv-LG" data_files: - split: train path: "data/lv-LG/*.parquet" - config_name: "lv-LT" data_files: - split: train path: "data/lv-LT/*.parquet" - config_name: "lv-LV" data_files: - split: train path: "data/lv-LV/*.parquet" - config_name: "lv-ME" data_files: - split: train path: "data/lv-ME/*.parquet" - config_name: "lv-NL" data_files: - split: train path: "data/lv-NL/*.parquet" - config_name: "lv-NO" data_files: - split: train path: "data/lv-NO/*.parquet" - config_name: "lv-PL" data_files: - split: train path: "data/lv-PL/*.parquet" - config_name: "lv-PW" 
data_files: - split: train path: "data/lv-PW/*.parquet" - config_name: "lv-RO" data_files: - split: train path: "data/lv-RO/*.parquet" - config_name: "lv-RU" data_files: - split: train path: "data/lv-RU/*.parquet" - config_name: "lv-SE" data_files: - split: train path: "data/lv-SE/*.parquet" - config_name: "lv-TV" data_files: - split: train path: "data/lv-TV/*.parquet" - config_name: "lv-US" data_files: - split: train path: "data/lv-US/*.parquet" - config_name: "lv-VA" data_files: - split: train path: "data/lv-VA/*.parquet" - config_name: "lv-WS" data_files: - split: train path: "data/lv-WS/*.parquet" - config_name: "lv-XX" data_files: - split: train path: "data/lv-XX/*.parquet" - config_name: "mg-ES" data_files: - split: train path: "data/mg-ES/*.parquet" - config_name: "mg-FR" data_files: - split: train path: "data/mg-FR/*.parquet" - config_name: "mg-GB" data_files: - split: train path: "data/mg-GB/*.parquet" - config_name: "mg-IN" data_files: - split: train path: "data/mg-IN/*.parquet" - config_name: "mg-MG" data_files: - split: train path: "data/mg-MG/*.parquet" - config_name: "mg-TO" data_files: - split: train path: "data/mg-TO/*.parquet" - config_name: "mg-US" data_files: - split: train path: "data/mg-US/*.parquet" - config_name: "mg-XX" data_files: - split: train path: "data/mg-XX/*.parquet" - config_name: "mhr-RU" data_files: - split: train path: "data/mhr-RU/*.parquet" - config_name: "mhr-XX" data_files: - split: train path: "data/mhr-XX/*.parquet" - config_name: "min-XX" data_files: - split: train path: "data/min-XX/*.parquet" - config_name: "mk-AL" data_files: - split: train path: "data/mk-AL/*.parquet" - config_name: "mk-AT" data_files: - split: train path: "data/mk-AT/*.parquet" - config_name: "mk-AU" data_files: - split: train path: "data/mk-AU/*.parquet" - config_name: "mk-BA" data_files: - split: train path: "data/mk-BA/*.parquet" - config_name: "mk-BG" data_files: - split: train path: "data/mk-BG/*.parquet" - config_name: "mk-BY" data_files: - 
split: train path: "data/mk-BY/*.parquet" - config_name: "mk-CC" data_files: - split: train path: "data/mk-CC/*.parquet" - config_name: "mk-CN" data_files: - split: train path: "data/mk-CN/*.parquet" - config_name: "mk-CO" data_files: - split: train path: "data/mk-CO/*.parquet" - config_name: "mk-DE" data_files: - split: train path: "data/mk-DE/*.parquet" - config_name: "mk-EU" data_files: - split: train path: "data/mk-EU/*.parquet" - config_name: "mk-FR" data_files: - split: train path: "data/mk-FR/*.parquet" - config_name: "mk-GB" data_files: - split: train path: "data/mk-GB/*.parquet" - config_name: "mk-GR" data_files: - split: train path: "data/mk-GR/*.parquet" - config_name: "mk-IS" data_files: - split: train path: "data/mk-IS/*.parquet" - config_name: "mk-IT" data_files: - split: train path: "data/mk-IT/*.parquet" - config_name: "mk-ME" data_files: - split: train path: "data/mk-ME/*.parquet" - config_name: "mk-MK" data_files: - split: train path: "data/mk-MK/*.parquet" - config_name: "mk-NU" data_files: - split: train path: "data/mk-NU/*.parquet" - config_name: "mk-PT" data_files: - split: train path: "data/mk-PT/*.parquet" - config_name: "mk-RO" data_files: - split: train path: "data/mk-RO/*.parquet" - config_name: "mk-RS" data_files: - split: train path: "data/mk-RS/*.parquet" - config_name: "mk-RU" data_files: - split: train path: "data/mk-RU/*.parquet" - config_name: "mk-TV" data_files: - split: train path: "data/mk-TV/*.parquet" - config_name: "mk-UA" data_files: - split: train path: "data/mk-UA/*.parquet" - config_name: "mk-US" data_files: - split: train path: "data/mk-US/*.parquet" - config_name: "mk-VA" data_files: - split: train path: "data/mk-VA/*.parquet" - config_name: "mk-XX" data_files: - split: train path: "data/mk-XX/*.parquet" - config_name: "ml-CO" data_files: - split: train path: "data/ml-CO/*.parquet" - config_name: "ml-GB" data_files: - split: train path: "data/ml-GB/*.parquet" - config_name: "ml-IN" data_files: - split: train path: 
"data/ml-IN/*.parquet" - config_name: "ml-IT" data_files: - split: train path: "data/ml-IT/*.parquet" - config_name: "ml-US" data_files: - split: train path: "data/ml-US/*.parquet" - config_name: "ml-VA" data_files: - split: train path: "data/ml-VA/*.parquet" - config_name: "ml-XX" data_files: - split: train path: "data/ml-XX/*.parquet" - config_name: "mn-AT" data_files: - split: train path: "data/mn-AT/*.parquet" - config_name: "mn-AU" data_files: - split: train path: "data/mn-AU/*.parquet" - config_name: "mn-BE" data_files: - split: train path: "data/mn-BE/*.parquet" - config_name: "mn-BZ" data_files: - split: train path: "data/mn-BZ/*.parquet" - config_name: "mn-CH" data_files: - split: train path: "data/mn-CH/*.parquet" - config_name: "mn-CZ" data_files: - split: train path: "data/mn-CZ/*.parquet" - config_name: "mn-DE" data_files: - split: train path: "data/mn-DE/*.parquet" - config_name: "mn-DK" data_files: - split: train path: "data/mn-DK/*.parquet" - config_name: "mn-EE" data_files: - split: train path: "data/mn-EE/*.parquet" - config_name: "mn-ES" data_files: - split: train path: "data/mn-ES/*.parquet" - config_name: "mn-EU" data_files: - split: train path: "data/mn-EU/*.parquet" - config_name: "mn-FR" data_files: - split: train path: "data/mn-FR/*.parquet" - config_name: "mn-GB" data_files: - split: train path: "data/mn-GB/*.parquet" - config_name: "mn-JP" data_files: - split: train path: "data/mn-JP/*.parquet" - config_name: "mn-KZ" data_files: - split: train path: "data/mn-KZ/*.parquet" - config_name: "mn-ME" data_files: - split: train path: "data/mn-ME/*.parquet" - config_name: "mn-MN" data_files: - split: train path: "data/mn-MN/*.parquet" - config_name: "mn-NL" data_files: - split: train path: "data/mn-NL/*.parquet" - config_name: "mn-PL" data_files: - split: train path: "data/mn-PL/*.parquet" - config_name: "mn-RU" data_files: - split: train path: "data/mn-RU/*.parquet" - config_name: "mn-TV" data_files: - split: train path: "data/mn-TV/*.parquet" - 
config_name: "mn-US" data_files: - split: train path: "data/mn-US/*.parquet" - config_name: "mn-XX" data_files: - split: train path: "data/mn-XX/*.parquet" - config_name: "mr-GB" data_files: - split: train path: "data/mr-GB/*.parquet" - config_name: "mr-IN" data_files: - split: train path: "data/mr-IN/*.parquet" - config_name: "mr-IT" data_files: - split: train path: "data/mr-IT/*.parquet" - config_name: "mr-US" data_files: - split: train path: "data/mr-US/*.parquet" - config_name: "mr-XX" data_files: - split: train path: "data/mr-XX/*.parquet" - config_name: "mrj-XX" data_files: - split: train path: "data/mrj-XX/*.parquet" - config_name: "ms-BN" data_files: - split: train path: "data/ms-BN/*.parquet" - config_name: "ms-ES" data_files: - split: train path: "data/ms-ES/*.parquet" - config_name: "ms-GB" data_files: - split: train path: "data/ms-GB/*.parquet" - config_name: "ms-HK" data_files: - split: train path: "data/ms-HK/*.parquet" - config_name: "ms-ID" data_files: - split: train path: "data/ms-ID/*.parquet" - config_name: "ms-MY" data_files: - split: train path: "data/ms-MY/*.parquet" - config_name: "ms-NL" data_files: - split: train path: "data/ms-NL/*.parquet" - config_name: "ms-SG" data_files: - split: train path: "data/ms-SG/*.parquet" - config_name: "ms-TV" data_files: - split: train path: "data/ms-TV/*.parquet" - config_name: "ms-US" data_files: - split: train path: "data/ms-US/*.parquet" - config_name: "ms-XX" data_files: - split: train path: "data/ms-XX/*.parquet" - config_name: "mt-DE" data_files: - split: train path: "data/mt-DE/*.parquet" - config_name: "mt-EU" data_files: - split: train path: "data/mt-EU/*.parquet" - config_name: "mt-FR" data_files: - split: train path: "data/mt-FR/*.parquet" - config_name: "mt-GB" data_files: - split: train path: "data/mt-GB/*.parquet" - config_name: "mt-MT" data_files: - split: train path: "data/mt-MT/*.parquet" - config_name: "mt-US" data_files: - split: train path: "data/mt-US/*.parquet" - config_name: "mt-XX" 
data_files: - split: train path: "data/mt-XX/*.parquet" - config_name: "mwl-XX" data_files: - split: train path: "data/mwl-XX/*.parquet" - config_name: "my-MM" data_files: - split: train path: "data/my-MM/*.parquet" - config_name: "my-US" data_files: - split: train path: "data/my-US/*.parquet" - config_name: "my-XX" data_files: - split: train path: "data/my-XX/*.parquet" - config_name: "mzn-XX" data_files: - split: train path: "data/mzn-XX/*.parquet" - config_name: "nah-XX" data_files: - split: train path: "data/nah-XX/*.parquet" - config_name: "nds-DE" data_files: - split: train path: "data/nds-DE/*.parquet" - config_name: "nds-XX" data_files: - split: train path: "data/nds-XX/*.parquet" - config_name: "ne-NP" data_files: - split: train path: "data/ne-NP/*.parquet" - config_name: "ne-US" data_files: - split: train path: "data/ne-US/*.parquet" - config_name: "ne-XX" data_files: - split: train path: "data/ne-XX/*.parquet" - config_name: "nl-AD" data_files: - split: train path: "data/nl-AD/*.parquet" - config_name: "nl-AE" data_files: - split: train path: "data/nl-AE/*.parquet" - config_name: "nl-AI" data_files: - split: train path: "data/nl-AI/*.parquet" - config_name: "nl-AM" data_files: - split: train path: "data/nl-AM/*.parquet" - config_name: "nl-AR" data_files: - split: train path: "data/nl-AR/*.parquet" - config_name: "nl-AS" data_files: - split: train path: "data/nl-AS/*.parquet" - config_name: "nl-AT" data_files: - split: train path: "data/nl-AT/*.parquet" - config_name: "nl-AU" data_files: - split: train path: "data/nl-AU/*.parquet" - config_name: "nl-AW" data_files: - split: train path: "data/nl-AW/*.parquet" - config_name: "nl-AZ" data_files: - split: train path: "data/nl-AZ/*.parquet" - config_name: "nl-BE" data_files: - split: train path: "data/nl-BE/*.parquet" - config_name: "nl-BG" data_files: - split: train path: "data/nl-BG/*.parquet" - config_name: "nl-BL" data_files: - split: train path: "data/nl-BL/*.parquet" - config_name: "nl-BM" data_files: - 
split: train path: "data/nl-BM/*.parquet" - config_name: "nl-BQ" data_files: - split: train path: "data/nl-BQ/*.parquet" - config_name: "nl-BR" data_files: - split: train path: "data/nl-BR/*.parquet" - config_name: "nl-BX" data_files: - split: train path: "data/nl-BX/*.parquet" - config_name: "nl-BZ" data_files: - split: train path: "data/nl-BZ/*.parquet" - config_name: "nl-CA" data_files: - split: train path: "data/nl-CA/*.parquet" - config_name: "nl-CC" data_files: - split: train path: "data/nl-CC/*.parquet" - config_name: "nl-CF" data_files: - split: train path: "data/nl-CF/*.parquet" - config_name: "nl-CH" data_files: - split: train path: "data/nl-CH/*.parquet" - config_name: "nl-CI" data_files: - split: train path: "data/nl-CI/*.parquet" - config_name: "nl-CL" data_files: - split: train path: "data/nl-CL/*.parquet" - config_name: "nl-CN" data_files: - split: train path: "data/nl-CN/*.parquet" - config_name: "nl-CO" data_files: - split: train path: "data/nl-CO/*.parquet" - config_name: "nl-CR" data_files: - split: train path: "data/nl-CR/*.parquet" - config_name: "nl-CW" data_files: - split: train path: "data/nl-CW/*.parquet" - config_name: "nl-CX" data_files: - split: train path: "data/nl-CX/*.parquet" - config_name: "nl-CZ" data_files: - split: train path: "data/nl-CZ/*.parquet" - config_name: "nl-DE" data_files: - split: train path: "data/nl-DE/*.parquet" - config_name: "nl-DK" data_files: - split: train path: "data/nl-DK/*.parquet" - config_name: "nl-EE" data_files: - split: train path: "data/nl-EE/*.parquet" - config_name: "nl-EN" data_files: - split: train path: "data/nl-EN/*.parquet" - config_name: "nl-ES" data_files: - split: train path: "data/nl-ES/*.parquet" - config_name: "nl-EU" data_files: - split: train path: "data/nl-EU/*.parquet" - config_name: "nl-FI" data_files: - split: train path: "data/nl-FI/*.parquet" - config_name: "nl-FM" data_files: - split: train path: "data/nl-FM/*.parquet" - config_name: "nl-FR" data_files: - split: train path: 
"data/nl-FR/*.parquet" - config_name: "nl-GA" data_files: - split: train path: "data/nl-GA/*.parquet" - config_name: "nl-GB" data_files: - split: train path: "data/nl-GB/*.parquet" - config_name: "nl-GQ" data_files: - split: train path: "data/nl-GQ/*.parquet" - config_name: "nl-GR" data_files: - split: train path: "data/nl-GR/*.parquet" - config_name: "nl-GX" data_files: - split: train path: "data/nl-GX/*.parquet" - config_name: "nl-HK" data_files: - split: train path: "data/nl-HK/*.parquet" - config_name: "nl-HR" data_files: - split: train path: "data/nl-HR/*.parquet" - config_name: "nl-HU" data_files: - split: train path: "data/nl-HU/*.parquet" - config_name: "nl-ID" data_files: - split: train path: "data/nl-ID/*.parquet" - config_name: "nl-IE" data_files: - split: train path: "data/nl-IE/*.parquet" - config_name: "nl-IL" data_files: - split: train path: "data/nl-IL/*.parquet" - config_name: "nl-IM" data_files: - split: train path: "data/nl-IM/*.parquet" - config_name: "nl-IN" data_files: - split: train path: "data/nl-IN/*.parquet" - config_name: "nl-IO" data_files: - split: train path: "data/nl-IO/*.parquet" - config_name: "nl-IR" data_files: - split: train path: "data/nl-IR/*.parquet" - config_name: "nl-IS" data_files: - split: train path: "data/nl-IS/*.parquet" - config_name: "nl-IT" data_files: - split: train path: "data/nl-IT/*.parquet" - config_name: "nl-JE" data_files: - split: train path: "data/nl-JE/*.parquet" - config_name: "nl-JP" data_files: - split: train path: "data/nl-JP/*.parquet" - config_name: "nl-KR" data_files: - split: train path: "data/nl-KR/*.parquet" - config_name: "nl-KW" data_files: - split: train path: "data/nl-KW/*.parquet" - config_name: "nl-LA" data_files: - split: train path: "data/nl-LA/*.parquet" - config_name: "nl-LI" data_files: - split: train path: "data/nl-LI/*.parquet" - config_name: "nl-LT" data_files: - split: train path: "data/nl-LT/*.parquet" - config_name: "nl-LU" data_files: - split: train path: "data/nl-LU/*.parquet" - 
config_name: "nl-LV" data_files: - split: train path: "data/nl-LV/*.parquet" - config_name: "nl-LY" data_files: - split: train path: "data/nl-LY/*.parquet" - config_name: "nl-MA" data_files: - split: train path: "data/nl-MA/*.parquet" - config_name: "nl-ME" data_files: - split: train path: "data/nl-ME/*.parquet" - config_name: "nl-ML" data_files: - split: train path: "data/nl-ML/*.parquet" - config_name: "nl-MS" data_files: - split: train path: "data/nl-MS/*.parquet" - config_name: "nl-MU" data_files: - split: train path: "data/nl-MU/*.parquet" - config_name: "nl-MX" data_files: - split: train path: "data/nl-MX/*.parquet" - config_name: "nl-MY" data_files: - split: train path: "data/nl-MY/*.parquet" - config_name: "nl-NL" data_files: - split: train path: "data/nl-NL/*.parquet" - config_name: "nl-NO" data_files: - split: train path: "data/nl-NO/*.parquet" - config_name: "nl-NU" data_files: - split: train path: "data/nl-NU/*.parquet" - config_name: "nl-PH" data_files: - split: train path: "data/nl-PH/*.parquet" - config_name: "nl-PL" data_files: - split: train path: "data/nl-PL/*.parquet" - config_name: "nl-PT" data_files: - split: train path: "data/nl-PT/*.parquet" - config_name: "nl-PW" data_files: - split: train path: "data/nl-PW/*.parquet" - config_name: "nl-RE" data_files: - split: train path: "data/nl-RE/*.parquet" - config_name: "nl-RO" data_files: - split: train path: "data/nl-RO/*.parquet" - config_name: "nl-RS" data_files: - split: train path: "data/nl-RS/*.parquet" - config_name: "nl-RU" data_files: - split: train path: "data/nl-RU/*.parquet" - config_name: "nl-RW" data_files: - split: train path: "data/nl-RW/*.parquet" - config_name: "nl-SA" data_files: - split: train path: "data/nl-SA/*.parquet" - config_name: "nl-SC" data_files: - split: train path: "data/nl-SC/*.parquet" - config_name: "nl-SE" data_files: - split: train path: "data/nl-SE/*.parquet" - config_name: "nl-SG" data_files: - split: train path: "data/nl-SG/*.parquet" - config_name: "nl-SH" 
data_files: - split: train path: "data/nl-SH/*.parquet" - config_name: "nl-SI" data_files: - split: train path: "data/nl-SI/*.parquet" - config_name: "nl-SK" data_files: - split: train path: "data/nl-SK/*.parquet" - config_name: "nl-SN" data_files: - split: train path: "data/nl-SN/*.parquet" - config_name: "nl-SR" data_files: - split: train path: "data/nl-SR/*.parquet" - config_name: "nl-ST" data_files: - split: train path: "data/nl-ST/*.parquet" - config_name: "nl-SX" data_files: - split: train path: "data/nl-SX/*.parquet" - config_name: "nl-TH" data_files: - split: train path: "data/nl-TH/*.parquet" - config_name: "nl-TK" data_files: - split: train path: "data/nl-TK/*.parquet" - config_name: "nl-TO" data_files: - split: train path: "data/nl-TO/*.parquet" - config_name: "nl-TR" data_files: - split: train path: "data/nl-TR/*.parquet" - config_name: "nl-TT" data_files: - split: train path: "data/nl-TT/*.parquet" - config_name: "nl-TV" data_files: - split: train path: "data/nl-TV/*.parquet" - config_name: "nl-TW" data_files: - split: train path: "data/nl-TW/*.parquet" - config_name: "nl-UA" data_files: - split: train path: "data/nl-UA/*.parquet" - config_name: "nl-UK" data_files: - split: train path: "data/nl-UK/*.parquet" - config_name: "nl-US" data_files: - split: train path: "data/nl-US/*.parquet" - config_name: "nl-VG" data_files: - split: train path: "data/nl-VG/*.parquet" - config_name: "nl-VN" data_files: - split: train path: "data/nl-VN/*.parquet" - config_name: "nl-VU" data_files: - split: train path: "data/nl-VU/*.parquet" - config_name: "nl-WS" data_files: - split: train path: "data/nl-WS/*.parquet" - config_name: "nl-XX" data_files: - split: train path: "data/nl-XX/*.parquet" - config_name: "nl-ZA" data_files: - split: train path: "data/nl-ZA/*.parquet" - config_name: "nl-ZM" data_files: - split: train path: "data/nl-ZM/*.parquet" - config_name: "nn-GB" data_files: - split: train path: "data/nn-GB/*.parquet" - config_name: "nn-LY" data_files: - split: 
train path: "data/nn-LY/*.parquet" - config_name: "nn-ME" data_files: - split: train path: "data/nn-ME/*.parquet" - config_name: "nn-NN" data_files: - split: train path: "data/nn-NN/*.parquet" - config_name: "nn-NO" data_files: - split: train path: "data/nn-NO/*.parquet" - config_name: "nn-US" data_files: - split: train path: "data/nn-US/*.parquet" - config_name: "nn-XX" data_files: - split: train path: "data/nn-XX/*.parquet" - config_name: "no-AI" data_files: - split: train path: "data/no-AI/*.parquet" - config_name: "no-AS" data_files: - split: train path: "data/no-AS/*.parquet" - config_name: "no-AT" data_files: - split: train path: "data/no-AT/*.parquet" - config_name: "no-AU" data_files: - split: train path: "data/no-AU/*.parquet" - config_name: "no-BE" data_files: - split: train path: "data/no-BE/*.parquet" - config_name: "no-BG" data_files: - split: train path: "data/no-BG/*.parquet" - config_name: "no-BM" data_files: - split: train path: "data/no-BM/*.parquet" - config_name: "no-BN" data_files: - split: train path: "data/no-BN/*.parquet" - config_name: "no-BR" data_files: - split: train path: "data/no-BR/*.parquet" - config_name: "no-BY" data_files: - split: train path: "data/no-BY/*.parquet" - config_name: "no-CA" data_files: - split: train path: "data/no-CA/*.parquet" - config_name: "no-CC" data_files: - split: train path: "data/no-CC/*.parquet" - config_name: "no-CH" data_files: - split: train path: "data/no-CH/*.parquet" - config_name: "no-CL" data_files: - split: train path: "data/no-CL/*.parquet" - config_name: "no-CN" data_files: - split: train path: "data/no-CN/*.parquet" - config_name: "no-CO" data_files: - split: train path: "data/no-CO/*.parquet" - config_name: "no-CZ" data_files: - split: train path: "data/no-CZ/*.parquet" - config_name: "no-DE" data_files: - split: train path: "data/no-DE/*.parquet" - config_name: "no-DK" data_files: - split: train path: "data/no-DK/*.parquet" - config_name: "no-EC" data_files: - split: train path: 
"data/no-EC/*.parquet" - config_name: "no-EN" data_files: - split: train path: "data/no-EN/*.parquet" - config_name: "no-ES" data_files: - split: train path: "data/no-ES/*.parquet" - config_name: "no-EU" data_files: - split: train path: "data/no-EU/*.parquet" - config_name: "no-FI" data_files: - split: train path: "data/no-FI/*.parquet" - config_name: "no-FM" data_files: - split: train path: "data/no-FM/*.parquet" - config_name: "no-FO" data_files: - split: train path: "data/no-FO/*.parquet" - config_name: "no-FR" data_files: - split: train path: "data/no-FR/*.parquet" - config_name: "no-GB" data_files: - split: train path: "data/no-GB/*.parquet" - config_name: "no-GR" data_files: - split: train path: "data/no-GR/*.parquet" - config_name: "no-HR" data_files: - split: train path: "data/no-HR/*.parquet" - config_name: "no-ID" data_files: - split: train path: "data/no-ID/*.parquet" - config_name: "no-IL" data_files: - split: train path: "data/no-IL/*.parquet" - config_name: "no-IN" data_files: - split: train path: "data/no-IN/*.parquet" - config_name: "no-IO" data_files: - split: train path: "data/no-IO/*.parquet" - config_name: "no-IR" data_files: - split: train path: "data/no-IR/*.parquet" - config_name: "no-IS" data_files: - split: train path: "data/no-IS/*.parquet" - config_name: "no-IT" data_files: - split: train path: "data/no-IT/*.parquet" - config_name: "no-JP" data_files: - split: train path: "data/no-JP/*.parquet" - config_name: "no-KR" data_files: - split: train path: "data/no-KR/*.parquet" - config_name: "no-LT" data_files: - split: train path: "data/no-LT/*.parquet" - config_name: "no-LY" data_files: - split: train path: "data/no-LY/*.parquet" - config_name: "no-MA" data_files: - split: train path: "data/no-MA/*.parquet" - config_name: "no-ME" data_files: - split: train path: "data/no-ME/*.parquet" - config_name: "no-ML" data_files: - split: train path: "data/no-ML/*.parquet" - config_name: "no-MX" data_files: - split: train path: "data/no-MX/*.parquet" - 
config_name: "no-NB" data_files: - split: train path: "data/no-NB/*.parquet" - config_name: "no-NL" data_files: - split: train path: "data/no-NL/*.parquet" - config_name: "no-NN" data_files: - split: train path: "data/no-NN/*.parquet" - config_name: "no-NO" data_files: - split: train path: "data/no-NO/*.parquet" - config_name: "no-NU" data_files: - split: train path: "data/no-NU/*.parquet" - config_name: "no-NZ" data_files: - split: train path: "data/no-NZ/*.parquet" - config_name: "no-PE" data_files: - split: train path: "data/no-PE/*.parquet" - config_name: "no-PL" data_files: - split: train path: "data/no-PL/*.parquet" - config_name: "no-PT" data_files: - split: train path: "data/no-PT/*.parquet" - config_name: "no-PW" data_files: - split: train path: "data/no-PW/*.parquet" - config_name: "no-PY" data_files: - split: train path: "data/no-PY/*.parquet" - config_name: "no-RO" data_files: - split: train path: "data/no-RO/*.parquet" - config_name: "no-RS" data_files: - split: train path: "data/no-RS/*.parquet" - config_name: "no-RU" data_files: - split: train path: "data/no-RU/*.parquet" - config_name: "no-SE" data_files: - split: train path: "data/no-SE/*.parquet" - config_name: "no-SH" data_files: - split: train path: "data/no-SH/*.parquet" - config_name: "no-SK" data_files: - split: train path: "data/no-SK/*.parquet" - config_name: "no-TK" data_files: - split: train path: "data/no-TK/*.parquet" - config_name: "no-TR" data_files: - split: train path: "data/no-TR/*.parquet" - config_name: "no-TV" data_files: - split: train path: "data/no-TV/*.parquet" - config_name: "no-TW" data_files: - split: train path: "data/no-TW/*.parquet" - config_name: "no-UA" data_files: - split: train path: "data/no-UA/*.parquet" - config_name: "no-UK" data_files: - split: train path: "data/no-UK/*.parquet" - config_name: "no-US" data_files: - split: train path: "data/no-US/*.parquet" - config_name: "no-VA" data_files: - split: train path: "data/no-VA/*.parquet" - config_name: "no-VN" 
data_files: - split: train path: "data/no-VN/*.parquet" - config_name: "no-XX" data_files: - split: train path: "data/no-XX/*.parquet" - config_name: "no-ZA" data_files: - split: train path: "data/no-ZA/*.parquet" - config_name: "oc-EU" data_files: - split: train path: "data/oc-EU/*.parquet" - config_name: "oc-FR" data_files: - split: train path: "data/oc-FR/*.parquet" - config_name: "oc-GB" data_files: - split: train path: "data/oc-GB/*.parquet" - config_name: "oc-XX" data_files: - split: train path: "data/oc-XX/*.parquet" - config_name: "or-IN" data_files: - split: train path: "data/or-IN/*.parquet" - config_name: "or-US" data_files: - split: train path: "data/or-US/*.parquet" - config_name: "or-XX" data_files: - split: train path: "data/or-XX/*.parquet" - config_name: "os-EU" data_files: - split: train path: "data/os-EU/*.parquet" - config_name: "os-RU" data_files: - split: train path: "data/os-RU/*.parquet" - config_name: "os-TV" data_files: - split: train path: "data/os-TV/*.parquet" - config_name: "os-US" data_files: - split: train path: "data/os-US/*.parquet" - config_name: "os-XX" data_files: - split: train path: "data/os-XX/*.parquet" - config_name: "pa-GB" data_files: - split: train path: "data/pa-GB/*.parquet" - config_name: "pa-IN" data_files: - split: train path: "data/pa-IN/*.parquet" - config_name: "pa-US" data_files: - split: train path: "data/pa-US/*.parquet" - config_name: "pa-XX" data_files: - split: train path: "data/pa-XX/*.parquet" - config_name: "pl-AA" data_files: - split: train path: "data/pl-AA/*.parquet" - config_name: "pl-AI" data_files: - split: train path: "data/pl-AI/*.parquet" - config_name: "pl-AL" data_files: - split: train path: "data/pl-AL/*.parquet" - config_name: "pl-AM" data_files: - split: train path: "data/pl-AM/*.parquet" - config_name: "pl-AR" data_files: - split: train path: "data/pl-AR/*.parquet" - config_name: "pl-AT" data_files: - split: train path: "data/pl-AT/*.parquet" - config_name: "pl-AU" data_files: - split: 
train path: "data/pl-AU/*.parquet" - config_name: "pl-BA" data_files: - split: train path: "data/pl-BA/*.parquet" - config_name: "pl-BE" data_files: - split: train path: "data/pl-BE/*.parquet" - config_name: "pl-BG" data_files: - split: train path: "data/pl-BG/*.parquet" - config_name: "pl-BM" data_files: - split: train path: "data/pl-BM/*.parquet" - config_name: "pl-BR" data_files: - split: train path: "data/pl-BR/*.parquet" - config_name: "pl-BY" data_files: - split: train path: "data/pl-BY/*.parquet" - config_name: "pl-CA" data_files: - split: train path: "data/pl-CA/*.parquet" - config_name: "pl-CC" data_files: - split: train path: "data/pl-CC/*.parquet" - config_name: "pl-CD" data_files: - split: train path: "data/pl-CD/*.parquet" - config_name: "pl-CF" data_files: - split: train path: "data/pl-CF/*.parquet" - config_name: "pl-CH" data_files: - split: train path: "data/pl-CH/*.parquet" - config_name: "pl-CL" data_files: - split: train path: "data/pl-CL/*.parquet" - config_name: "pl-CN" data_files: - split: train path: "data/pl-CN/*.parquet" - config_name: "pl-CO" data_files: - split: train path: "data/pl-CO/*.parquet" - config_name: "pl-CY" data_files: - split: train path: "data/pl-CY/*.parquet" - config_name: "pl-CZ" data_files: - split: train path: "data/pl-CZ/*.parquet" - config_name: "pl-DE" data_files: - split: train path: "data/pl-DE/*.parquet" - config_name: "pl-DK" data_files: - split: train path: "data/pl-DK/*.parquet" - config_name: "pl-DO" data_files: - split: train path: "data/pl-DO/*.parquet" - config_name: "pl-EC" data_files: - split: train path: "data/pl-EC/*.parquet" - config_name: "pl-EE" data_files: - split: train path: "data/pl-EE/*.parquet" - config_name: "pl-EG" data_files: - split: train path: "data/pl-EG/*.parquet" - config_name: "pl-EN" data_files: - split: train path: "data/pl-EN/*.parquet" - config_name: "pl-ES" data_files: - split: train path: "data/pl-ES/*.parquet" - config_name: "pl-EU" data_files: - split: train path: 
"data/pl-EU/*.parquet" - config_name: "pl-FI" data_files: - split: train path: "data/pl-FI/*.parquet" - config_name: "pl-FM" data_files: - split: train path: "data/pl-FM/*.parquet" - config_name: "pl-FR" data_files: - split: train path: "data/pl-FR/*.parquet" - config_name: "pl-GA" data_files: - split: train path: "data/pl-GA/*.parquet" - config_name: "pl-GB" data_files: - split: train path: "data/pl-GB/*.parquet" - config_name: "pl-GD" data_files: - split: train path: "data/pl-GD/*.parquet" - config_name: "pl-GE" data_files: - split: train path: "data/pl-GE/*.parquet" - config_name: "pl-GP" data_files: - split: train path: "data/pl-GP/*.parquet" - config_name: "pl-GR" data_files: - split: train path: "data/pl-GR/*.parquet" - config_name: "pl-HK" data_files: - split: train path: "data/pl-HK/*.parquet" - config_name: "pl-HR" data_files: - split: train path: "data/pl-HR/*.parquet" - config_name: "pl-HT" data_files: - split: train path: "data/pl-HT/*.parquet" - config_name: "pl-HU" data_files: - split: train path: "data/pl-HU/*.parquet" - config_name: "pl-ID" data_files: - split: train path: "data/pl-ID/*.parquet" - config_name: "pl-IE" data_files: - split: train path: "data/pl-IE/*.parquet" - config_name: "pl-IL" data_files: - split: train path: "data/pl-IL/*.parquet" - config_name: "pl-IN" data_files: - split: train path: "data/pl-IN/*.parquet" - config_name: "pl-IO" data_files: - split: train path: "data/pl-IO/*.parquet" - config_name: "pl-IR" data_files: - split: train path: "data/pl-IR/*.parquet" - config_name: "pl-IS" data_files: - split: train path: "data/pl-IS/*.parquet" - config_name: "pl-IT" data_files: - split: train path: "data/pl-IT/*.parquet" - config_name: "pl-JP" data_files: - split: train path: "data/pl-JP/*.parquet" - config_name: "pl-KE" data_files: - split: train path: "data/pl-KE/*.parquet" - config_name: "pl-KR" data_files: - split: train path: "data/pl-KR/*.parquet" - config_name: "pl-LA" data_files: - split: train path: "data/pl-LA/*.parquet" - 
config_name: "pl-LT" data_files: - split: train path: "data/pl-LT/*.parquet" - config_name: "pl-LU" data_files: - split: train path: "data/pl-LU/*.parquet" - config_name: "pl-LV" data_files: - split: train path: "data/pl-LV/*.parquet" - config_name: "pl-ME" data_files: - split: train path: "data/pl-ME/*.parquet" - config_name: "pl-ML" data_files: - split: train path: "data/pl-ML/*.parquet" - config_name: "pl-MX" data_files: - split: train path: "data/pl-MX/*.parquet" - config_name: "pl-MY" data_files: - split: train path: "data/pl-MY/*.parquet" - config_name: "pl-NF" data_files: - split: train path: "data/pl-NF/*.parquet" - config_name: "pl-NL" data_files: - split: train path: "data/pl-NL/*.parquet" - config_name: "pl-NO" data_files: - split: train path: "data/pl-NO/*.parquet" - config_name: "pl-NZ" data_files: - split: train path: "data/pl-NZ/*.parquet" - config_name: "pl-PE" data_files: - split: train path: "data/pl-PE/*.parquet" - config_name: "pl-PH" data_files: - split: train path: "data/pl-PH/*.parquet" - config_name: "pl-PK" data_files: - split: train path: "data/pl-PK/*.parquet" - config_name: "pl-PL" data_files: - split: train path: "data/pl-PL/*.parquet" - config_name: "pl-PO" data_files: - split: train path: "data/pl-PO/*.parquet" - config_name: "pl-PT" data_files: - split: train path: "data/pl-PT/*.parquet" - config_name: "pl-PW" data_files: - split: train path: "data/pl-PW/*.parquet" - config_name: "pl-RO" data_files: - split: train path: "data/pl-RO/*.parquet" - config_name: "pl-RS" data_files: - split: train path: "data/pl-RS/*.parquet" - config_name: "pl-RU" data_files: - split: train path: "data/pl-RU/*.parquet" - config_name: "pl-SA" data_files: - split: train path: "data/pl-SA/*.parquet" - config_name: "pl-SE" data_files: - split: train path: "data/pl-SE/*.parquet" - config_name: "pl-SH" data_files: - split: train path: "data/pl-SH/*.parquet" - config_name: "pl-SI" data_files: - split: train path: "data/pl-SI/*.parquet" - config_name: "pl-SK" 
data_files: - split: train path: "data/pl-SK/*.parquet" - config_name: "pl-TK" data_files: - split: train path: "data/pl-TK/*.parquet" - config_name: "pl-TL" data_files: - split: train path: "data/pl-TL/*.parquet" - config_name: "pl-TO" data_files: - split: train path: "data/pl-TO/*.parquet" - config_name: "pl-TR" data_files: - split: train path: "data/pl-TR/*.parquet" - config_name: "pl-TV" data_files: - split: train path: "data/pl-TV/*.parquet" - config_name: "pl-TW" data_files: - split: train path: "data/pl-TW/*.parquet" - config_name: "pl-UA" data_files: - split: train path: "data/pl-UA/*.parquet" - config_name: "pl-UK" data_files: - split: train path: "data/pl-UK/*.parquet" - config_name: "pl-US" data_files: - split: train path: "data/pl-US/*.parquet" - config_name: "pl-VA" data_files: - split: train path: "data/pl-VA/*.parquet" - config_name: "pl-VN" data_files: - split: train path: "data/pl-VN/*.parquet" - config_name: "pl-WS" data_files: - split: train path: "data/pl-WS/*.parquet" - config_name: "pl-XX" data_files: - split: train path: "data/pl-XX/*.parquet" - config_name: "pl-YT" data_files: - split: train path: "data/pl-YT/*.parquet" - config_name: "pl-ZA" data_files: - split: train path: "data/pl-ZA/*.parquet" - config_name: "pms-GB" data_files: - split: train path: "data/pms-GB/*.parquet" - config_name: "pms-XX" data_files: - split: train path: "data/pms-XX/*.parquet" - config_name: "pnb-PK" data_files: - split: train path: "data/pnb-PK/*.parquet" - config_name: "pnb-US" data_files: - split: train path: "data/pnb-US/*.parquet" - config_name: "pnb-XX" data_files: - split: train path: "data/pnb-XX/*.parquet" - config_name: "ps-AF" data_files: - split: train path: "data/ps-AF/*.parquet" - config_name: "ps-CA" data_files: - split: train path: "data/ps-CA/*.parquet" - config_name: "ps-CN" data_files: - split: train path: "data/ps-CN/*.parquet" - config_name: "ps-CO" data_files: - split: train path: "data/ps-CO/*.parquet" - config_name: "ps-DE" data_files: - 
split: train path: "data/ps-DE/*.parquet" - config_name: "ps-FR" data_files: - split: train path: "data/ps-FR/*.parquet" - config_name: "ps-GB" data_files: - split: train path: "data/ps-GB/*.parquet" - config_name: "ps-IR" data_files: - split: train path: "data/ps-IR/*.parquet" - config_name: "ps-KZ" data_files: - split: train path: "data/ps-KZ/*.parquet" - config_name: "ps-NL" data_files: - split: train path: "data/ps-NL/*.parquet" - config_name: "ps-PA" data_files: - split: train path: "data/ps-PA/*.parquet" - config_name: "ps-PK" data_files: - split: train path: "data/ps-PK/*.parquet" - config_name: "ps-PS" data_files: - split: train path: "data/ps-PS/*.parquet" - config_name: "ps-TR" data_files: - split: train path: "data/ps-TR/*.parquet" - config_name: "ps-TV" data_files: - split: train path: "data/ps-TV/*.parquet" - config_name: "ps-US" data_files: - split: train path: "data/ps-US/*.parquet" - config_name: "ps-UZ" data_files: - split: train path: "data/ps-UZ/*.parquet" - config_name: "ps-XX" data_files: - split: train path: "data/ps-XX/*.parquet" - config_name: "pt-AE" data_files: - split: train path: "data/pt-AE/*.parquet" - config_name: "pt-AG" data_files: - split: train path: "data/pt-AG/*.parquet" - config_name: "pt-AI" data_files: - split: train path: "data/pt-AI/*.parquet" - config_name: "pt-AL" data_files: - split: train path: "data/pt-AL/*.parquet" - config_name: "pt-AM" data_files: - split: train path: "data/pt-AM/*.parquet" - config_name: "pt-AO" data_files: - split: train path: "data/pt-AO/*.parquet" - config_name: "pt-AR" data_files: - split: train path: "data/pt-AR/*.parquet" - config_name: "pt-AT" data_files: - split: train path: "data/pt-AT/*.parquet" - config_name: "pt-AU" data_files: - split: train path: "data/pt-AU/*.parquet" - config_name: "pt-BA" data_files: - split: train path: "data/pt-BA/*.parquet" - config_name: "pt-BD" data_files: - split: train path: "data/pt-BD/*.parquet" - config_name: "pt-BE" data_files: - split: train path: 
"data/pt-BE/*.parquet" - config_name: "pt-BG" data_files: - split: train path: "data/pt-BG/*.parquet" - config_name: "pt-BM" data_files: - split: train path: "data/pt-BM/*.parquet" - config_name: "pt-BO" data_files: - split: train path: "data/pt-BO/*.parquet" - config_name: "pt-BR" data_files: - split: train path: "data/pt-BR/*.parquet" - config_name: "pt-BT" data_files: - split: train path: "data/pt-BT/*.parquet" - config_name: "pt-BY" data_files: - split: train path: "data/pt-BY/*.parquet" - config_name: "pt-BZ" data_files: - split: train path: "data/pt-BZ/*.parquet" - config_name: "pt-CA" data_files: - split: train path: "data/pt-CA/*.parquet" - config_name: "pt-CC" data_files: - split: train path: "data/pt-CC/*.parquet" - config_name: "pt-CF" data_files: - split: train path: "data/pt-CF/*.parquet" - config_name: "pt-CH" data_files: - split: train path: "data/pt-CH/*.parquet" - config_name: "pt-CL" data_files: - split: train path: "data/pt-CL/*.parquet" - config_name: "pt-CN" data_files: - split: train path: "data/pt-CN/*.parquet" - config_name: "pt-CO" data_files: - split: train path: "data/pt-CO/*.parquet" - config_name: "pt-CR" data_files: - split: train path: "data/pt-CR/*.parquet" - config_name: "pt-CU" data_files: - split: train path: "data/pt-CU/*.parquet" - config_name: "pt-CV" data_files: - split: train path: "data/pt-CV/*.parquet" - config_name: "pt-CY" data_files: - split: train path: "data/pt-CY/*.parquet" - config_name: "pt-CZ" data_files: - split: train path: "data/pt-CZ/*.parquet" - config_name: "pt-DE" data_files: - split: train path: "data/pt-DE/*.parquet" - config_name: "pt-DJ" data_files: - split: train path: "data/pt-DJ/*.parquet" - config_name: "pt-DK" data_files: - split: train path: "data/pt-DK/*.parquet" - config_name: "pt-DO" data_files: - split: train path: "data/pt-DO/*.parquet" - config_name: "pt-EN" data_files: - split: train path: "data/pt-EN/*.parquet" - config_name: "pt-ES" data_files: - split: train path: "data/pt-ES/*.parquet" - 
config_name: "pt-EU" data_files: - split: train path: "data/pt-EU/*.parquet" - config_name: "pt-FI" data_files: - split: train path: "data/pt-FI/*.parquet" - config_name: "pt-FM" data_files: - split: train path: "data/pt-FM/*.parquet" - config_name: "pt-FO" data_files: - split: train path: "data/pt-FO/*.parquet" - config_name: "pt-FR" data_files: - split: train path: "data/pt-FR/*.parquet" - config_name: "pt-GA" data_files: - split: train path: "data/pt-GA/*.parquet" - config_name: "pt-GB" data_files: - split: train path: "data/pt-GB/*.parquet" - config_name: "pt-GE" data_files: - split: train path: "data/pt-GE/*.parquet" - config_name: "pt-GG" data_files: - split: train path: "data/pt-GG/*.parquet" - config_name: "pt-GL" data_files: - split: train path: "data/pt-GL/*.parquet" - config_name: "pt-GQ" data_files: - split: train path: "data/pt-GQ/*.parquet" - config_name: "pt-GR" data_files: - split: train path: "data/pt-GR/*.parquet" - config_name: "pt-GW" data_files: - split: train path: "data/pt-GW/*.parquet" - config_name: "pt-GX" data_files: - split: train path: "data/pt-GX/*.parquet" - config_name: "pt-GY" data_files: - split: train path: "data/pt-GY/*.parquet" - config_name: "pt-GZ" data_files: - split: train path: "data/pt-GZ/*.parquet" - config_name: "pt-HK" data_files: - split: train path: "data/pt-HK/*.parquet" - config_name: "pt-HU" data_files: - split: train path: "data/pt-HU/*.parquet" - config_name: "pt-ID" data_files: - split: train path: "data/pt-ID/*.parquet" - config_name: "pt-IE" data_files: - split: train path: "data/pt-IE/*.parquet" - config_name: "pt-IL" data_files: - split: train path: "data/pt-IL/*.parquet" - config_name: "pt-IM" data_files: - split: train path: "data/pt-IM/*.parquet" - config_name: "pt-IN" data_files: - split: train path: "data/pt-IN/*.parquet" - config_name: "pt-IO" data_files: - split: train path: "data/pt-IO/*.parquet" - config_name: "pt-IR" data_files: - split: train path: "data/pt-IR/*.parquet" - config_name: "pt-IS" 
data_files: - split: train path: "data/pt-IS/*.parquet" - config_name: "pt-IT" data_files: - split: train path: "data/pt-IT/*.parquet" - config_name: "pt-JM" data_files: - split: train path: "data/pt-JM/*.parquet" - config_name: "pt-JP" data_files: - split: train path: "data/pt-JP/*.parquet" - config_name: "pt-KE" data_files: - split: train path: "data/pt-KE/*.parquet" - config_name: "pt-KR" data_files: - split: train path: "data/pt-KR/*.parquet" - config_name: "pt-LA" data_files: - split: train path: "data/pt-LA/*.parquet" - config_name: "pt-LI" data_files: - split: train path: "data/pt-LI/*.parquet" - config_name: "pt-LT" data_files: - split: train path: "data/pt-LT/*.parquet" - config_name: "pt-LU" data_files: - split: train path: "data/pt-LU/*.parquet" - config_name: "pt-MA" data_files: - split: train path: "data/pt-MA/*.parquet" - config_name: "pt-MC" data_files: - split: train path: "data/pt-MC/*.parquet" - config_name: "pt-ME" data_files: - split: train path: "data/pt-ME/*.parquet" - config_name: "pt-ML" data_files: - split: train path: "data/pt-ML/*.parquet" - config_name: "pt-MN" data_files: - split: train path: "data/pt-MN/*.parquet" - config_name: "pt-MO" data_files: - split: train path: "data/pt-MO/*.parquet" - config_name: "pt-MP" data_files: - split: train path: "data/pt-MP/*.parquet" - config_name: "pt-MS" data_files: - split: train path: "data/pt-MS/*.parquet" - config_name: "pt-MX" data_files: - split: train path: "data/pt-MX/*.parquet" - config_name: "pt-MY" data_files: - split: train path: "data/pt-MY/*.parquet" - config_name: "pt-MZ" data_files: - split: train path: "data/pt-MZ/*.parquet" - config_name: "pt-NL" data_files: - split: train path: "data/pt-NL/*.parquet" - config_name: "pt-NO" data_files: - split: train path: "data/pt-NO/*.parquet" - config_name: "pt-NZ" data_files: - split: train path: "data/pt-NZ/*.parquet" - config_name: "pt-PA" data_files: - split: train path: "data/pt-PA/*.parquet" - config_name: "pt-PE" data_files: - split: 
train path: "data/pt-PE/*.parquet" - config_name: "pt-PH" data_files: - split: train path: "data/pt-PH/*.parquet" - config_name: "pt-PL" data_files: - split: train path: "data/pt-PL/*.parquet" - config_name: "pt-PM" data_files: - split: train path: "data/pt-PM/*.parquet" - config_name: "pt-PR" data_files: - split: train path: "data/pt-PR/*.parquet" - config_name: "pt-PT" data_files: - split: train path: "data/pt-PT/*.parquet" - config_name: "pt-PW" data_files: - split: train path: "data/pt-PW/*.parquet" - config_name: "pt-PY" data_files: - split: train path: "data/pt-PY/*.parquet" - config_name: "pt-RO" data_files: - split: train path: "data/pt-RO/*.parquet" - config_name: "pt-RS" data_files: - split: train path: "data/pt-RS/*.parquet" - config_name: "pt-RU" data_files: - split: train path: "data/pt-RU/*.parquet" - config_name: "pt-SA" data_files: - split: train path: "data/pt-SA/*.parquet" - config_name: "pt-SE" data_files: - split: train path: "data/pt-SE/*.parquet" - config_name: "pt-SI" data_files: - split: train path: "data/pt-SI/*.parquet" - config_name: "pt-SK" data_files: - split: train path: "data/pt-SK/*.parquet" - config_name: "pt-SM" data_files: - split: train path: "data/pt-SM/*.parquet" - config_name: "pt-SO" data_files: - split: train path: "data/pt-SO/*.parquet" - config_name: "pt-SP" data_files: - split: train path: "data/pt-SP/*.parquet" - config_name: "pt-ST" data_files: - split: train path: "data/pt-ST/*.parquet" - config_name: "pt-TH" data_files: - split: train path: "data/pt-TH/*.parquet" - config_name: "pt-TK" data_files: - split: train path: "data/pt-TK/*.parquet" - config_name: "pt-TL" data_files: - split: train path: "data/pt-TL/*.parquet" - config_name: "pt-TO" data_files: - split: train path: "data/pt-TO/*.parquet" - config_name: "pt-TR" data_files: - split: train path: "data/pt-TR/*.parquet" - config_name: "pt-TV" data_files: - split: train path: "data/pt-TV/*.parquet" - config_name: "pt-TW" data_files: - split: train path: 
"data/pt-TW/*.parquet" - config_name: "pt-UA" data_files: - split: train path: "data/pt-UA/*.parquet" - config_name: "pt-UK" data_files: - split: train path: "data/pt-UK/*.parquet" - config_name: "pt-US" data_files: - split: train path: "data/pt-US/*.parquet" - config_name: "pt-UY" data_files: - split: train path: "data/pt-UY/*.parquet" - config_name: "pt-VA" data_files: - split: train path: "data/pt-VA/*.parquet" - config_name: "pt-VC" data_files: - split: train path: "data/pt-VC/*.parquet" - config_name: "pt-VN" data_files: - split: train path: "data/pt-VN/*.parquet" - config_name: "pt-VU" data_files: - split: train path: "data/pt-VU/*.parquet" - config_name: "pt-WF" data_files: - split: train path: "data/pt-WF/*.parquet" - config_name: "pt-WP" data_files: - split: train path: "data/pt-WP/*.parquet" - config_name: "pt-WS" data_files: - split: train path: "data/pt-WS/*.parquet" - config_name: "pt-XX" data_files: - split: train path: "data/pt-XX/*.parquet" - config_name: "pt-ZA" data_files: - split: train path: "data/pt-ZA/*.parquet" - config_name: "qu-XX" data_files: - split: train path: "data/qu-XX/*.parquet" - config_name: "rm-CH" data_files: - split: train path: "data/rm-CH/*.parquet" - config_name: "ro-AD" data_files: - split: train path: "data/ro-AD/*.parquet" - config_name: "ro-AI" data_files: - split: train path: "data/ro-AI/*.parquet" - config_name: "ro-AM" data_files: - split: train path: "data/ro-AM/*.parquet" - config_name: "ro-AR" data_files: - split: train path: "data/ro-AR/*.parquet" - config_name: "ro-AT" data_files: - split: train path: "data/ro-AT/*.parquet" - config_name: "ro-AU" data_files: - split: train path: "data/ro-AU/*.parquet" - config_name: "ro-BA" data_files: - split: train path: "data/ro-BA/*.parquet" - config_name: "ro-BE" data_files: - split: train path: "data/ro-BE/*.parquet" - config_name: "ro-BG" data_files: - split: train path: "data/ro-BG/*.parquet" - config_name: "ro-BR" data_files: - split: train path: "data/ro-BR/*.parquet" - 
config_name: "ro-BZ" data_files: - split: train path: "data/ro-BZ/*.parquet" - config_name: "ro-CA" data_files: - split: train path: "data/ro-CA/*.parquet" - config_name: "ro-CC" data_files: - split: train path: "data/ro-CC/*.parquet" - config_name: "ro-CF" data_files: - split: train path: "data/ro-CF/*.parquet" - config_name: "ro-CH" data_files: - split: train path: "data/ro-CH/*.parquet" - config_name: "ro-CL" data_files: - split: train path: "data/ro-CL/*.parquet" - config_name: "ro-CN" data_files: - split: train path: "data/ro-CN/*.parquet" - config_name: "ro-CO" data_files: - split: train path: "data/ro-CO/*.parquet" - config_name: "ro-CZ" data_files: - split: train path: "data/ro-CZ/*.parquet" - config_name: "ro-DE" data_files: - split: train path: "data/ro-DE/*.parquet" - config_name: "ro-DK" data_files: - split: train path: "data/ro-DK/*.parquet" - config_name: "ro-EE" data_files: - split: train path: "data/ro-EE/*.parquet" - config_name: "ro-ES" data_files: - split: train path: "data/ro-ES/*.parquet" - config_name: "ro-EU" data_files: - split: train path: "data/ro-EU/*.parquet" - config_name: "ro-FM" data_files: - split: train path: "data/ro-FM/*.parquet" - config_name: "ro-FR" data_files: - split: train path: "data/ro-FR/*.parquet" - config_name: "ro-GA" data_files: - split: train path: "data/ro-GA/*.parquet" - config_name: "ro-GB" data_files: - split: train path: "data/ro-GB/*.parquet" - config_name: "ro-GL" data_files: - split: train path: "data/ro-GL/*.parquet" - config_name: "ro-GQ" data_files: - split: train path: "data/ro-GQ/*.parquet" - config_name: "ro-GR" data_files: - split: train path: "data/ro-GR/*.parquet" - config_name: "ro-HR" data_files: - split: train path: "data/ro-HR/*.parquet" - config_name: "ro-HU" data_files: - split: train path: "data/ro-HU/*.parquet" - config_name: "ro-IE" data_files: - split: train path: "data/ro-IE/*.parquet" - config_name: "ro-IM" data_files: - split: train path: "data/ro-IM/*.parquet" - config_name: "ro-IN" 
data_files: - split: train path: "data/ro-IN/*.parquet" - config_name: "ro-IO" data_files: - split: train path: "data/ro-IO/*.parquet" - config_name: "ro-IR" data_files: - split: train path: "data/ro-IR/*.parquet" - config_name: "ro-IT" data_files: - split: train path: "data/ro-IT/*.parquet" - config_name: "ro-JP" data_files: - split: train path: "data/ro-JP/*.parquet" - config_name: "ro-KZ" data_files: - split: train path: "data/ro-KZ/*.parquet" - config_name: "ro-LA" data_files: - split: train path: "data/ro-LA/*.parquet" - config_name: "ro-LU" data_files: - split: train path: "data/ro-LU/*.parquet" - config_name: "ro-MD" data_files: - split: train path: "data/ro-MD/*.parquet" - config_name: "ro-ME" data_files: - split: train path: "data/ro-ME/*.parquet" - config_name: "ro-MK" data_files: - split: train path: "data/ro-MK/*.parquet" - config_name: "ro-ML" data_files: - split: train path: "data/ro-ML/*.parquet" - config_name: "ro-NL" data_files: - split: train path: "data/ro-NL/*.parquet" - config_name: "ro-NO" data_files: - split: train path: "data/ro-NO/*.parquet" - config_name: "ro-PL" data_files: - split: train path: "data/ro-PL/*.parquet" - config_name: "ro-PT" data_files: - split: train path: "data/ro-PT/*.parquet" - config_name: "ro-RO" data_files: - split: train path: "data/ro-RO/*.parquet" - config_name: "ro-RS" data_files: - split: train path: "data/ro-RS/*.parquet" - config_name: "ro-RU" data_files: - split: train path: "data/ro-RU/*.parquet" - config_name: "ro-SE" data_files: - split: train path: "data/ro-SE/*.parquet" - config_name: "ro-SK" data_files: - split: train path: "data/ro-SK/*.parquet" - config_name: "ro-SO" data_files: - split: train path: "data/ro-SO/*.parquet" - config_name: "ro-ST" data_files: - split: train path: "data/ro-ST/*.parquet" - config_name: "ro-TK" data_files: - split: train path: "data/ro-TK/*.parquet" - config_name: "ro-TL" data_files: - split: train path: "data/ro-TL/*.parquet" - config_name: "ro-TO" data_files: - split: 
train path: "data/ro-TO/*.parquet" - config_name: "ro-TV" data_files: - split: train path: "data/ro-TV/*.parquet" - config_name: "ro-UA" data_files: - split: train path: "data/ro-UA/*.parquet" - config_name: "ro-US" data_files: - split: train path: "data/ro-US/*.parquet" - config_name: "ro-VA" data_files: - split: train path: "data/ro-VA/*.parquet" - config_name: "ro-VN" data_files: - split: train path: "data/ro-VN/*.parquet" - config_name: "ro-WS" data_files: - split: train path: "data/ro-WS/*.parquet" - config_name: "ro-XX" data_files: - split: train path: "data/ro-XX/*.parquet" - config_name: "ru-AE" data_files: - split: train path: "data/ru-AE/*.parquet" - config_name: "ru-AG" data_files: - split: train path: "data/ru-AG/*.parquet" - config_name: "ru-AI" data_files: - split: train path: "data/ru-AI/*.parquet" - config_name: "ru-AL" data_files: - split: train path: "data/ru-AL/*.parquet" - config_name: "ru-AM" data_files: - split: train path: "data/ru-AM/*.parquet" - config_name: "ru-AR" data_files: - split: train path: "data/ru-AR/*.parquet" - config_name: "ru-AT" data_files: - split: train path: "data/ru-AT/*.parquet" - config_name: "ru-AU" data_files: - split: train path: "data/ru-AU/*.parquet" - config_name: "ru-AZ" data_files: - split: train path: "data/ru-AZ/*.parquet" - config_name: "ru-BA" data_files: - split: train path: "data/ru-BA/*.parquet" - config_name: "ru-BE" data_files: - split: train path: "data/ru-BE/*.parquet" - config_name: "ru-BG" data_files: - split: train path: "data/ru-BG/*.parquet" - config_name: "ru-BI" data_files: - split: train path: "data/ru-BI/*.parquet" - config_name: "ru-BM" data_files: - split: train path: "data/ru-BM/*.parquet" - config_name: "ru-BO" data_files: - split: train path: "data/ru-BO/*.parquet" - config_name: "ru-BR" data_files: - split: train path: "data/ru-BR/*.parquet" - config_name: "ru-BY" data_files: - split: train path: "data/ru-BY/*.parquet" - config_name: "ru-BZ" data_files: - split: train path: 
"data/ru-BZ/*.parquet" - config_name: "ru-CA" data_files: - split: train path: "data/ru-CA/*.parquet" - config_name: "ru-CC" data_files: - split: train path: "data/ru-CC/*.parquet" - config_name: "ru-CD" data_files: - split: train path: "data/ru-CD/*.parquet" - config_name: "ru-CF" data_files: - split: train path: "data/ru-CF/*.parquet" - config_name: "ru-CH" data_files: - split: train path: "data/ru-CH/*.parquet" - config_name: "ru-CL" data_files: - split: train path: "data/ru-CL/*.parquet" - config_name: "ru-CN" data_files: - split: train path: "data/ru-CN/*.parquet" - config_name: "ru-CO" data_files: - split: train path: "data/ru-CO/*.parquet" - config_name: "ru-CU" data_files: - split: train path: "data/ru-CU/*.parquet" - config_name: "ru-CX" data_files: - split: train path: "data/ru-CX/*.parquet" - config_name: "ru-CY" data_files: - split: train path: "data/ru-CY/*.parquet" - config_name: "ru-CZ" data_files: - split: train path: "data/ru-CZ/*.parquet" - config_name: "ru-DE" data_files: - split: train path: "data/ru-DE/*.parquet" - config_name: "ru-DJ" data_files: - split: train path: "data/ru-DJ/*.parquet" - config_name: "ru-DK" data_files: - split: train path: "data/ru-DK/*.parquet" - config_name: "ru-DM" data_files: - split: train path: "data/ru-DM/*.parquet" - config_name: "ru-DO" data_files: - split: train path: "data/ru-DO/*.parquet" - config_name: "ru-DZ" data_files: - split: train path: "data/ru-DZ/*.parquet" - config_name: "ru-EC" data_files: - split: train path: "data/ru-EC/*.parquet" - config_name: "ru-EE" data_files: - split: train path: "data/ru-EE/*.parquet" - config_name: "ru-EN" data_files: - split: train path: "data/ru-EN/*.parquet" - config_name: "ru-ES" data_files: - split: train path: "data/ru-ES/*.parquet" - config_name: "ru-EU" data_files: - split: train path: "data/ru-EU/*.parquet" - config_name: "ru-EX" data_files: - split: train path: "data/ru-EX/*.parquet" - config_name: "ru-FI" data_files: - split: train path: "data/ru-FI/*.parquet" - 
config_name: "ru-FM" data_files: - split: train path: "data/ru-FM/*.parquet" - config_name: "ru-FR" data_files: - split: train path: "data/ru-FR/*.parquet" - config_name: "ru-GA" data_files: - split: train path: "data/ru-GA/*.parquet" - config_name: "ru-GB" data_files: - split: train path: "data/ru-GB/*.parquet" - config_name: "ru-GE" data_files: - split: train path: "data/ru-GE/*.parquet" - config_name: "ru-GG" data_files: - split: train path: "data/ru-GG/*.parquet" - config_name: "ru-GQ" data_files: - split: train path: "data/ru-GQ/*.parquet" - config_name: "ru-GR" data_files: - split: train path: "data/ru-GR/*.parquet" - config_name: "ru-HK" data_files: - split: train path: "data/ru-HK/*.parquet" - config_name: "ru-HR" data_files: - split: train path: "data/ru-HR/*.parquet" - config_name: "ru-HT" data_files: - split: train path: "data/ru-HT/*.parquet" - config_name: "ru-HU" data_files: - split: train path: "data/ru-HU/*.parquet" - config_name: "ru-ID" data_files: - split: train path: "data/ru-ID/*.parquet" - config_name: "ru-IE" data_files: - split: train path: "data/ru-IE/*.parquet" - config_name: "ru-IL" data_files: - split: train path: "data/ru-IL/*.parquet" - config_name: "ru-IM" data_files: - split: train path: "data/ru-IM/*.parquet" - config_name: "ru-IN" data_files: - split: train path: "data/ru-IN/*.parquet" - config_name: "ru-IO" data_files: - split: train path: "data/ru-IO/*.parquet" - config_name: "ru-IR" data_files: - split: train path: "data/ru-IR/*.parquet" - config_name: "ru-IS" data_files: - split: train path: "data/ru-IS/*.parquet" - config_name: "ru-IT" data_files: - split: train path: "data/ru-IT/*.parquet" - config_name: "ru-JP" data_files: - split: train path: "data/ru-JP/*.parquet" - config_name: "ru-KA" data_files: - split: train path: "data/ru-KA/*.parquet" - config_name: "ru-KE" data_files: - split: train path: "data/ru-KE/*.parquet" - config_name: "ru-KG" data_files: - split: train path: "data/ru-KG/*.parquet" - config_name: "ru-KK" 
data_files: - split: train path: "data/ru-KK/*.parquet" - config_name: "ru-KP" data_files: - split: train path: "data/ru-KP/*.parquet" - config_name: "ru-KR" data_files: - split: train path: "data/ru-KR/*.parquet" - config_name: "ru-KY" data_files: - split: train path: "data/ru-KY/*.parquet" - config_name: "ru-KZ" data_files: - split: train path: "data/ru-KZ/*.parquet" - config_name: "ru-LA" data_files: - split: train path: "data/ru-LA/*.parquet" - config_name: "ru-LI" data_files: - split: train path: "data/ru-LI/*.parquet" - config_name: "ru-LK" data_files: - split: train path: "data/ru-LK/*.parquet" - config_name: "ru-LT" data_files: - split: train path: "data/ru-LT/*.parquet" - config_name: "ru-LU" data_files: - split: train path: "data/ru-LU/*.parquet" - config_name: "ru-LV" data_files: - split: train path: "data/ru-LV/*.parquet" - config_name: "ru-LY" data_files: - split: train path: "data/ru-LY/*.parquet" - config_name: "ru-MA" data_files: - split: train path: "data/ru-MA/*.parquet" - config_name: "ru-MD" data_files: - split: train path: "data/ru-MD/*.parquet" - config_name: "ru-ME" data_files: - split: train path: "data/ru-ME/*.parquet" - config_name: "ru-MG" data_files: - split: train path: "data/ru-MG/*.parquet" - config_name: "ru-MK" data_files: - split: train path: "data/ru-MK/*.parquet" - config_name: "ru-ML" data_files: - split: train path: "data/ru-ML/*.parquet" - config_name: "ru-MN" data_files: - split: train path: "data/ru-MN/*.parquet" - config_name: "ru-MS" data_files: - split: train path: "data/ru-MS/*.parquet" - config_name: "ru-MT" data_files: - split: train path: "data/ru-MT/*.parquet" - config_name: "ru-MU" data_files: - split: train path: "data/ru-MU/*.parquet" - config_name: "ru-MW" data_files: - split: train path: "data/ru-MW/*.parquet" - config_name: "ru-MX" data_files: - split: train path: "data/ru-MX/*.parquet" - config_name: "ru-MY" data_files: - split: train path: "data/ru-MY/*.parquet" - config_name: "ru-NG" data_files: - split: 
train path: "data/ru-NG/*.parquet" - config_name: "ru-NL" data_files: - split: train path: "data/ru-NL/*.parquet" - config_name: "ru-NO" data_files: - split: train path: "data/ru-NO/*.parquet" - config_name: "ru-NP" data_files: - split: train path: "data/ru-NP/*.parquet" - config_name: "ru-NU" data_files: - split: train path: "data/ru-NU/*.parquet" - config_name: "ru-NZ" data_files: - split: train path: "data/ru-NZ/*.parquet" - config_name: "ru-PE" data_files: - split: train path: "data/ru-PE/*.parquet" - config_name: "ru-PH" data_files: - split: train path: "data/ru-PH/*.parquet" - config_name: "ru-PK" data_files: - split: train path: "data/ru-PK/*.parquet" - config_name: "ru-PL" data_files: - split: train path: "data/ru-PL/*.parquet" - config_name: "ru-PM" data_files: - split: train path: "data/ru-PM/*.parquet" - config_name: "ru-PN" data_files: - split: train path: "data/ru-PN/*.parquet" - config_name: "ru-PT" data_files: - split: train path: "data/ru-PT/*.parquet" - config_name: "ru-PW" data_files: - split: train path: "data/ru-PW/*.parquet" - config_name: "ru-QA" data_files: - split: train path: "data/ru-QA/*.parquet" - config_name: "ru-QT" data_files: - split: train path: "data/ru-QT/*.parquet" - config_name: "ru-RE" data_files: - split: train path: "data/ru-RE/*.parquet" - config_name: "ru-RI" data_files: - split: train path: "data/ru-RI/*.parquet" - config_name: "ru-RO" data_files: - split: train path: "data/ru-RO/*.parquet" - config_name: "ru-RS" data_files: - split: train path: "data/ru-RS/*.parquet" - config_name: "ru-RU" data_files: - split: train path: "data/ru-RU/*.parquet" - config_name: "ru-SA" data_files: - split: train path: "data/ru-SA/*.parquet" - config_name: "ru-SC" data_files: - split: train path: "data/ru-SC/*.parquet" - config_name: "ru-SE" data_files: - split: train path: "data/ru-SE/*.parquet" - config_name: "ru-SG" data_files: - split: train path: "data/ru-SG/*.parquet" - config_name: "ru-SH" data_files: - split: train path: 
"data/ru-SH/*.parquet" - config_name: "ru-SI" data_files: - split: train path: "data/ru-SI/*.parquet" - config_name: "ru-SK" data_files: - split: train path: "data/ru-SK/*.parquet" - config_name: "ru-SO" data_files: - split: train path: "data/ru-SO/*.parquet" - config_name: "ru-SR" data_files: - split: train path: "data/ru-SR/*.parquet" - config_name: "ru-ST" data_files: - split: train path: "data/ru-ST/*.parquet" - config_name: "ru-SX" data_files: - split: train path: "data/ru-SX/*.parquet" - config_name: "ru-TC" data_files: - split: train path: "data/ru-TC/*.parquet" - config_name: "ru-TG" data_files: - split: train path: "data/ru-TG/*.parquet" - config_name: "ru-TH" data_files: - split: train path: "data/ru-TH/*.parquet" - config_name: "ru-TJ" data_files: - split: train path: "data/ru-TJ/*.parquet" - config_name: "ru-TK" data_files: - split: train path: "data/ru-TK/*.parquet" - config_name: "ru-TL" data_files: - split: train path: "data/ru-TL/*.parquet" - config_name: "ru-TM" data_files: - split: train path: "data/ru-TM/*.parquet" - config_name: "ru-TO" data_files: - split: train path: "data/ru-TO/*.parquet" - config_name: "ru-TR" data_files: - split: train path: "data/ru-TR/*.parquet" - config_name: "ru-TT" data_files: - split: train path: "data/ru-TT/*.parquet" - config_name: "ru-TV" data_files: - split: train path: "data/ru-TV/*.parquet" - config_name: "ru-TW" data_files: - split: train path: "data/ru-TW/*.parquet" - config_name: "ru-UA" data_files: - split: train path: "data/ru-UA/*.parquet" - config_name: "ru-UK" data_files: - split: train path: "data/ru-UK/*.parquet" - config_name: "ru-US" data_files: - split: train path: "data/ru-US/*.parquet" - config_name: "ru-UY" data_files: - split: train path: "data/ru-UY/*.parquet" - config_name: "ru-UZ" data_files: - split: train path: "data/ru-UZ/*.parquet" - config_name: "ru-VA" data_files: - split: train path: "data/ru-VA/*.parquet" - config_name: "ru-VC" data_files: - split: train path: "data/ru-VC/*.parquet" - 
config_name: "ru-VE" data_files: - split: train path: "data/ru-VE/*.parquet" - config_name: "ru-VG" data_files: - split: train path: "data/ru-VG/*.parquet" - config_name: "ru-VN" data_files: - split: train path: "data/ru-VN/*.parquet" - config_name: "ru-WF" data_files: - split: train path: "data/ru-WF/*.parquet" - config_name: "ru-WO" data_files: - split: train path: "data/ru-WO/*.parquet" - config_name: "ru-WS" data_files: - split: train path: "data/ru-WS/*.parquet" - config_name: "ru-XX" data_files: - split: train path: "data/ru-XX/*.parquet" - config_name: "ru-YU" data_files: - split: train path: "data/ru-YU/*.parquet" - config_name: "ru-ZA" data_files: - split: train path: "data/ru-ZA/*.parquet" - config_name: "sah-RU" data_files: - split: train path: "data/sah-RU/*.parquet" - config_name: "sah-XX" data_files: - split: train path: "data/sah-XX/*.parquet" - config_name: "scn-XX" data_files: - split: train path: "data/scn-XX/*.parquet" - config_name: "sd-ES" data_files: - split: train path: "data/sd-ES/*.parquet" - config_name: "sd-GB" data_files: - split: train path: "data/sd-GB/*.parquet" - config_name: "sd-PK" data_files: - split: train path: "data/sd-PK/*.parquet" - config_name: "sd-TV" data_files: - split: train path: "data/sd-TV/*.parquet" - config_name: "sd-US" data_files: - split: train path: "data/sd-US/*.parquet" - config_name: "sd-XX" data_files: - split: train path: "data/sd-XX/*.parquet" - config_name: "sh-BA" data_files: - split: train path: "data/sh-BA/*.parquet" - config_name: "sh-RS" data_files: - split: train path: "data/sh-RS/*.parquet" - config_name: "sh-US" data_files: - split: train path: "data/sh-US/*.parquet" - config_name: "sh-XX" data_files: - split: train path: "data/sh-XX/*.parquet" - config_name: "sh-YU" data_files: - split: train path: "data/sh-YU/*.parquet" - config_name: "si-GB" data_files: - split: train path: "data/si-GB/*.parquet" - config_name: "si-LK" data_files: - split: train path: "data/si-LK/*.parquet" - config_name: 
"si-US" data_files: - split: train path: "data/si-US/*.parquet" - config_name: "si-XX" data_files: - split: train path: "data/si-XX/*.parquet" - config_name: "sk-AG" data_files: - split: train path: "data/sk-AG/*.parquet" - config_name: "sk-AT" data_files: - split: train path: "data/sk-AT/*.parquet" - config_name: "sk-BM" data_files: - split: train path: "data/sk-BM/*.parquet" - config_name: "sk-CA" data_files: - split: train path: "data/sk-CA/*.parquet" - config_name: "sk-CC" data_files: - split: train path: "data/sk-CC/*.parquet" - config_name: "sk-CH" data_files: - split: train path: "data/sk-CH/*.parquet" - config_name: "sk-CN" data_files: - split: train path: "data/sk-CN/*.parquet" - config_name: "sk-CO" data_files: - split: train path: "data/sk-CO/*.parquet" - config_name: "sk-CZ" data_files: - split: train path: "data/sk-CZ/*.parquet" - config_name: "sk-DE" data_files: - split: train path: "data/sk-DE/*.parquet" - config_name: "sk-DK" data_files: - split: train path: "data/sk-DK/*.parquet" - config_name: "sk-EE" data_files: - split: train path: "data/sk-EE/*.parquet" - config_name: "sk-ES" data_files: - split: train path: "data/sk-ES/*.parquet" - config_name: "sk-EU" data_files: - split: train path: "data/sk-EU/*.parquet" - config_name: "sk-FI" data_files: - split: train path: "data/sk-FI/*.parquet" - config_name: "sk-FM" data_files: - split: train path: "data/sk-FM/*.parquet" - config_name: "sk-FR" data_files: - split: train path: "data/sk-FR/*.parquet" - config_name: "sk-GB" data_files: - split: train path: "data/sk-GB/*.parquet" - config_name: "sk-GR" data_files: - split: train path: "data/sk-GR/*.parquet" - config_name: "sk-HK" data_files: - split: train path: "data/sk-HK/*.parquet" - config_name: "sk-HR" data_files: - split: train path: "data/sk-HR/*.parquet" - config_name: "sk-HU" data_files: - split: train path: "data/sk-HU/*.parquet" - config_name: "sk-IO" data_files: - split: train path: "data/sk-IO/*.parquet" - config_name: "sk-IS" data_files: - 
split: train path: "data/sk-IS/*.parquet" - config_name: "sk-IT" data_files: - split: train path: "data/sk-IT/*.parquet" - config_name: "sk-LT" data_files: - split: train path: "data/sk-LT/*.parquet" - config_name: "sk-LV" data_files: - split: train path: "data/sk-LV/*.parquet" - config_name: "sk-ME" data_files: - split: train path: "data/sk-ME/*.parquet" - config_name: "sk-MK" data_files: - split: train path: "data/sk-MK/*.parquet" - config_name: "sk-NL" data_files: - split: train path: "data/sk-NL/*.parquet" - config_name: "sk-PL" data_files: - split: train path: "data/sk-PL/*.parquet" - config_name: "sk-PT" data_files: - split: train path: "data/sk-PT/*.parquet" - config_name: "sk-RO" data_files: - split: train path: "data/sk-RO/*.parquet" - config_name: "sk-RS" data_files: - split: train path: "data/sk-RS/*.parquet" - config_name: "sk-RU" data_files: - split: train path: "data/sk-RU/*.parquet" - config_name: "sk-SE" data_files: - split: train path: "data/sk-SE/*.parquet" - config_name: "sk-SG" data_files: - split: train path: "data/sk-SG/*.parquet" - config_name: "sk-SI" data_files: - split: train path: "data/sk-SI/*.parquet" - config_name: "sk-SK" data_files: - split: train path: "data/sk-SK/*.parquet" - config_name: "sk-TO" data_files: - split: train path: "data/sk-TO/*.parquet" - config_name: "sk-TV" data_files: - split: train path: "data/sk-TV/*.parquet" - config_name: "sk-UA" data_files: - split: train path: "data/sk-UA/*.parquet" - config_name: "sk-US" data_files: - split: train path: "data/sk-US/*.parquet" - config_name: "sk-VA" data_files: - split: train path: "data/sk-VA/*.parquet" - config_name: "sk-WS" data_files: - split: train path: "data/sk-WS/*.parquet" - config_name: "sk-XX" data_files: - split: train path: "data/sk-XX/*.parquet" - config_name: "sl-AT" data_files: - split: train path: "data/sl-AT/*.parquet" - config_name: "sl-BG" data_files: - split: train path: "data/sl-BG/*.parquet" - config_name: "sl-BR" data_files: - split: train path: 
"data/sl-BR/*.parquet" - config_name: "sl-CA" data_files: - split: train path: "data/sl-CA/*.parquet" - config_name: "sl-CC" data_files: - split: train path: "data/sl-CC/*.parquet" - config_name: "sl-CH" data_files: - split: train path: "data/sl-CH/*.parquet" - config_name: "sl-CN" data_files: - split: train path: "data/sl-CN/*.parquet" - config_name: "sl-CO" data_files: - split: train path: "data/sl-CO/*.parquet" - config_name: "sl-CZ" data_files: - split: train path: "data/sl-CZ/*.parquet" - config_name: "sl-DE" data_files: - split: train path: "data/sl-DE/*.parquet" - config_name: "sl-DK" data_files: - split: train path: "data/sl-DK/*.parquet" - config_name: "sl-EE" data_files: - split: train path: "data/sl-EE/*.parquet" - config_name: "sl-ES" data_files: - split: train path: "data/sl-ES/*.parquet" - config_name: "sl-EU" data_files: - split: train path: "data/sl-EU/*.parquet" - config_name: "sl-FI" data_files: - split: train path: "data/sl-FI/*.parquet" - config_name: "sl-FR" data_files: - split: train path: "data/sl-FR/*.parquet" - config_name: "sl-GB" data_files: - split: train path: "data/sl-GB/*.parquet" - config_name: "sl-HR" data_files: - split: train path: "data/sl-HR/*.parquet" - config_name: "sl-HU" data_files: - split: train path: "data/sl-HU/*.parquet" - config_name: "sl-IO" data_files: - split: train path: "data/sl-IO/*.parquet" - config_name: "sl-IT" data_files: - split: train path: "data/sl-IT/*.parquet" - config_name: "sl-JE" data_files: - split: train path: "data/sl-JE/*.parquet" - config_name: "sl-JP" data_files: - split: train path: "data/sl-JP/*.parquet" - config_name: "sl-KS" data_files: - split: train path: "data/sl-KS/*.parquet" - config_name: "sl-ME" data_files: - split: train path: "data/sl-ME/*.parquet" - config_name: "sl-MS" data_files: - split: train path: "data/sl-MS/*.parquet" - config_name: "sl-NL" data_files: - split: train path: "data/sl-NL/*.parquet" - config_name: "sl-NZ" data_files: - split: train path: "data/sl-NZ/*.parquet" - 
config_name: "sl-PL" data_files: - split: train path: "data/sl-PL/*.parquet" - config_name: "sl-PT" data_files: - split: train path: "data/sl-PT/*.parquet" - config_name: "sl-RE" data_files: - split: train path: "data/sl-RE/*.parquet" - config_name: "sl-RO" data_files: - split: train path: "data/sl-RO/*.parquet" - config_name: "sl-RS" data_files: - split: train path: "data/sl-RS/*.parquet" - config_name: "sl-RU" data_files: - split: train path: "data/sl-RU/*.parquet" - config_name: "sl-SE" data_files: - split: train path: "data/sl-SE/*.parquet" - config_name: "sl-SI" data_files: - split: train path: "data/sl-SI/*.parquet" - config_name: "sl-SK" data_files: - split: train path: "data/sl-SK/*.parquet" - config_name: "sl-SL" data_files: - split: train path: "data/sl-SL/*.parquet" - config_name: "sl-TL" data_files: - split: train path: "data/sl-TL/*.parquet" - config_name: "sl-TV" data_files: - split: train path: "data/sl-TV/*.parquet" - config_name: "sl-US" data_files: - split: train path: "data/sl-US/*.parquet" - config_name: "sl-VA" data_files: - split: train path: "data/sl-VA/*.parquet" - config_name: "sl-WS" data_files: - split: train path: "data/sl-WS/*.parquet" - config_name: "sl-XX" data_files: - split: train path: "data/sl-XX/*.parquet" - config_name: "sq-AL" data_files: - split: train path: "data/sq-AL/*.parquet" - config_name: "sq-AU" data_files: - split: train path: "data/sq-AU/*.parquet" - config_name: "sq-BE" data_files: - split: train path: "data/sq-BE/*.parquet" - config_name: "sq-BG" data_files: - split: train path: "data/sq-BG/*.parquet" - config_name: "sq-BR" data_files: - split: train path: "data/sq-BR/*.parquet" - config_name: "sq-CA" data_files: - split: train path: "data/sq-CA/*.parquet" - config_name: "sq-CC" data_files: - split: train path: "data/sq-CC/*.parquet" - config_name: "sq-CH" data_files: - split: train path: "data/sq-CH/*.parquet" - config_name: "sq-CN" data_files: - split: train path: "data/sq-CN/*.parquet" - config_name: "sq-CO" 
data_files: - split: train path: "data/sq-CO/*.parquet" - config_name: "sq-CZ" data_files: - split: train path: "data/sq-CZ/*.parquet" - config_name: "sq-DE" data_files: - split: train path: "data/sq-DE/*.parquet" - config_name: "sq-DK" data_files: - split: train path: "data/sq-DK/*.parquet" - config_name: "sq-EN" data_files: - split: train path: "data/sq-EN/*.parquet" - config_name: "sq-EU" data_files: - split: train path: "data/sq-EU/*.parquet" - config_name: "sq-FM" data_files: - split: train path: "data/sq-FM/*.parquet" - config_name: "sq-FR" data_files: - split: train path: "data/sq-FR/*.parquet" - config_name: "sq-GB" data_files: - split: train path: "data/sq-GB/*.parquet" - config_name: "sq-GR" data_files: - split: train path: "data/sq-GR/*.parquet" - config_name: "sq-IN" data_files: - split: train path: "data/sq-IN/*.parquet" - config_name: "sq-IR" data_files: - split: train path: "data/sq-IR/*.parquet" - config_name: "sq-IS" data_files: - split: train path: "data/sq-IS/*.parquet" - config_name: "sq-IT" data_files: - split: train path: "data/sq-IT/*.parquet" - config_name: "sq-KS" data_files: - split: train path: "data/sq-KS/*.parquet" - config_name: "sq-LT" data_files: - split: train path: "data/sq-LT/*.parquet" - config_name: "sq-ME" data_files: - split: train path: "data/sq-ME/*.parquet" - config_name: "sq-MK" data_files: - split: train path: "data/sq-MK/*.parquet" - config_name: "sq-NO" data_files: - split: train path: "data/sq-NO/*.parquet" - config_name: "sq-RS" data_files: - split: train path: "data/sq-RS/*.parquet" - config_name: "sq-RU" data_files: - split: train path: "data/sq-RU/*.parquet" - config_name: "sq-SH" data_files: - split: train path: "data/sq-SH/*.parquet" - config_name: "sq-SQ" data_files: - split: train path: "data/sq-SQ/*.parquet" - config_name: "sq-TL" data_files: - split: train path: "data/sq-TL/*.parquet" - config_name: "sq-TV" data_files: - split: train path: "data/sq-TV/*.parquet" - config_name: "sq-UA" data_files: - split: 
train path: "data/sq-UA/*.parquet" - config_name: "sq-US" data_files: - split: train path: "data/sq-US/*.parquet" - config_name: "sq-VA" data_files: - split: train path: "data/sq-VA/*.parquet" - config_name: "sq-XK" data_files: - split: train path: "data/sq-XK/*.parquet" - config_name: "sq-XX" data_files: - split: train path: "data/sq-XX/*.parquet" - config_name: "sr-AT" data_files: - split: train path: "data/sr-AT/*.parquet" - config_name: "sr-BA" data_files: - split: train path: "data/sr-BA/*.parquet" - config_name: "sr-BE" data_files: - split: train path: "data/sr-BE/*.parquet" - config_name: "sr-BG" data_files: - split: train path: "data/sr-BG/*.parquet" - config_name: "sr-BZ" data_files: - split: train path: "data/sr-BZ/*.parquet" - config_name: "sr-CA" data_files: - split: train path: "data/sr-CA/*.parquet" - config_name: "sr-CC" data_files: - split: train path: "data/sr-CC/*.parquet" - config_name: "sr-CH" data_files: - split: train path: "data/sr-CH/*.parquet" - config_name: "sr-CI" data_files: - split: train path: "data/sr-CI/*.parquet" - config_name: "sr-CN" data_files: - split: train path: "data/sr-CN/*.parquet" - config_name: "sr-CO" data_files: - split: train path: "data/sr-CO/*.parquet" - config_name: "sr-CR" data_files: - split: train path: "data/sr-CR/*.parquet" - config_name: "sr-CS" data_files: - split: train path: "data/sr-CS/*.parquet" - config_name: "sr-DE" data_files: - split: train path: "data/sr-DE/*.parquet" - config_name: "sr-ES" data_files: - split: train path: "data/sr-ES/*.parquet" - config_name: "sr-EU" data_files: - split: train path: "data/sr-EU/*.parquet" - config_name: "sr-FR" data_files: - split: train path: "data/sr-FR/*.parquet" - config_name: "sr-GB" data_files: - split: train path: "data/sr-GB/*.parquet" - config_name: "sr-HR" data_files: - split: train path: "data/sr-HR/*.parquet" - config_name: "sr-HU" data_files: - split: train path: "data/sr-HU/*.parquet" - config_name: "sr-IN" data_files: - split: train path: 
"data/sr-IN/*.parquet" - config_name: "sr-IS" data_files: - split: train path: "data/sr-IS/*.parquet" - config_name: "sr-IT" data_files: - split: train path: "data/sr-IT/*.parquet" - config_name: "sr-JP" data_files: - split: train path: "data/sr-JP/*.parquet" - config_name: "sr-LA" data_files: - split: train path: "data/sr-LA/*.parquet" - config_name: "sr-LT" data_files: - split: train path: "data/sr-LT/*.parquet" - config_name: "sr-ME" data_files: - split: train path: "data/sr-ME/*.parquet" - config_name: "sr-MK" data_files: - split: train path: "data/sr-MK/*.parquet" - config_name: "sr-NL" data_files: - split: train path: "data/sr-NL/*.parquet" - config_name: "sr-NZ" data_files: - split: train path: "data/sr-NZ/*.parquet" - config_name: "sr-PL" data_files: - split: train path: "data/sr-PL/*.parquet" - config_name: "sr-PW" data_files: - split: train path: "data/sr-PW/*.parquet" - config_name: "sr-RO" data_files: - split: train path: "data/sr-RO/*.parquet" - config_name: "sr-RS" data_files: - split: train path: "data/sr-RS/*.parquet" - config_name: "sr-RU" data_files: - split: train path: "data/sr-RU/*.parquet" - config_name: "sr-SE" data_files: - split: train path: "data/sr-SE/*.parquet" - config_name: "sr-SK" data_files: - split: train path: "data/sr-SK/*.parquet" - config_name: "sr-SR" data_files: - split: train path: "data/sr-SR/*.parquet" - config_name: "sr-TM" data_files: - split: train path: "data/sr-TM/*.parquet" - config_name: "sr-TV" data_files: - split: train path: "data/sr-TV/*.parquet" - config_name: "sr-UA" data_files: - split: train path: "data/sr-UA/*.parquet" - config_name: "sr-US" data_files: - split: train path: "data/sr-US/*.parquet" - config_name: "sr-WS" data_files: - split: train path: "data/sr-WS/*.parquet" - config_name: "sr-XX" data_files: - split: train path: "data/sr-XX/*.parquet" - config_name: "sr-YU" data_files: - split: train path: "data/sr-YU/*.parquet" - config_name: "su-ST" data_files: - split: train path: "data/su-ST/*.parquet" - 
config_name: "su-XX" data_files: - split: train path: "data/su-XX/*.parquet" - config_name: "sv-AD" data_files: - split: train path: "data/sv-AD/*.parquet" - config_name: "sv-AE" data_files: - split: train path: "data/sv-AE/*.parquet" - config_name: "sv-AI" data_files: - split: train path: "data/sv-AI/*.parquet" - config_name: "sv-AR" data_files: - split: train path: "data/sv-AR/*.parquet" - config_name: "sv-AS" data_files: - split: train path: "data/sv-AS/*.parquet" - config_name: "sv-AT" data_files: - split: train path: "data/sv-AT/*.parquet" - config_name: "sv-AU" data_files: - split: train path: "data/sv-AU/*.parquet" - config_name: "sv-AX" data_files: - split: train path: "data/sv-AX/*.parquet" - config_name: "sv-AZ" data_files: - split: train path: "data/sv-AZ/*.parquet" - config_name: "sv-BA" data_files: - split: train path: "data/sv-BA/*.parquet" - config_name: "sv-BE" data_files: - split: train path: "data/sv-BE/*.parquet" - config_name: "sv-BG" data_files: - split: train path: "data/sv-BG/*.parquet" - config_name: "sv-BM" data_files: - split: train path: "data/sv-BM/*.parquet" - config_name: "sv-BR" data_files: - split: train path: "data/sv-BR/*.parquet" - config_name: "sv-BZ" data_files: - split: train path: "data/sv-BZ/*.parquet" - config_name: "sv-CA" data_files: - split: train path: "data/sv-CA/*.parquet" - config_name: "sv-CC" data_files: - split: train path: "data/sv-CC/*.parquet" - config_name: "sv-CF" data_files: - split: train path: "data/sv-CF/*.parquet" - config_name: "sv-CH" data_files: - split: train path: "data/sv-CH/*.parquet" - config_name: "sv-CN" data_files: - split: train path: "data/sv-CN/*.parquet" - config_name: "sv-CO" data_files: - split: train path: "data/sv-CO/*.parquet" - config_name: "sv-CY" data_files: - split: train path: "data/sv-CY/*.parquet" - config_name: "sv-CZ" data_files: - split: train path: "data/sv-CZ/*.parquet" - config_name: "sv-DE" data_files: - split: train path: "data/sv-DE/*.parquet" - config_name: "sv-DK" 
data_files: - split: train path: "data/sv-DK/*.parquet" - config_name: "sv-DO" data_files: - split: train path: "data/sv-DO/*.parquet" - config_name: "sv-EE" data_files: - split: train path: "data/sv-EE/*.parquet" - config_name: "sv-EN" data_files: - split: train path: "data/sv-EN/*.parquet" - config_name: "sv-ES" data_files: - split: train path: "data/sv-ES/*.parquet" - config_name: "sv-EU" data_files: - split: train path: "data/sv-EU/*.parquet" - config_name: "sv-FI" data_files: - split: train path: "data/sv-FI/*.parquet" - config_name: "sv-FM" data_files: - split: train path: "data/sv-FM/*.parquet" - config_name: "sv-FO" data_files: - split: train path: "data/sv-FO/*.parquet" - config_name: "sv-FR" data_files: - split: train path: "data/sv-FR/*.parquet" - config_name: "sv-GA" data_files: - split: train path: "data/sv-GA/*.parquet" - config_name: "sv-GB" data_files: - split: train path: "data/sv-GB/*.parquet" - config_name: "sv-GG" data_files: - split: train path: "data/sv-GG/*.parquet" - config_name: "sv-GR" data_files: - split: train path: "data/sv-GR/*.parquet" - config_name: "sv-HR" data_files: - split: train path: "data/sv-HR/*.parquet" - config_name: "sv-HU" data_files: - split: train path: "data/sv-HU/*.parquet" - config_name: "sv-ID" data_files: - split: train path: "data/sv-ID/*.parquet" - config_name: "sv-IE" data_files: - split: train path: "data/sv-IE/*.parquet" - config_name: "sv-IL" data_files: - split: train path: "data/sv-IL/*.parquet" - config_name: "sv-IN" data_files: - split: train path: "data/sv-IN/*.parquet" - config_name: "sv-IO" data_files: - split: train path: "data/sv-IO/*.parquet" - config_name: "sv-IR" data_files: - split: train path: "data/sv-IR/*.parquet" - config_name: "sv-IS" data_files: - split: train path: "data/sv-IS/*.parquet" - config_name: "sv-IT" data_files: - split: train path: "data/sv-IT/*.parquet" - config_name: "sv-JP" data_files: - split: train path: "data/sv-JP/*.parquet" - config_name: "sv-KR" data_files: - split: 
train path: "data/sv-KR/*.parquet" - config_name: "sv-KW" data_files: - split: train path: "data/sv-KW/*.parquet" - config_name: "sv-KZ" data_files: - split: train path: "data/sv-KZ/*.parquet" - config_name: "sv-LA" data_files: - split: train path: "data/sv-LA/*.parquet" - config_name: "sv-LK" data_files: - split: train path: "data/sv-LK/*.parquet" - config_name: "sv-LT" data_files: - split: train path: "data/sv-LT/*.parquet" - config_name: "sv-LV" data_files: - split: train path: "data/sv-LV/*.parquet" - config_name: "sv-MD" data_files: - split: train path: "data/sv-MD/*.parquet" - config_name: "sv-ME" data_files: - split: train path: "data/sv-ME/*.parquet" - config_name: "sv-MT" data_files: - split: train path: "data/sv-MT/*.parquet" - config_name: "sv-MX" data_files: - split: train path: "data/sv-MX/*.parquet" - config_name: "sv-NG" data_files: - split: train path: "data/sv-NG/*.parquet" - config_name: "sv-NL" data_files: - split: train path: "data/sv-NL/*.parquet" - config_name: "sv-NO" data_files: - split: train path: "data/sv-NO/*.parquet" - config_name: "sv-NU" data_files: - split: train path: "data/sv-NU/*.parquet" - config_name: "sv-NZ" data_files: - split: train path: "data/sv-NZ/*.parquet" - config_name: "sv-PE" data_files: - split: train path: "data/sv-PE/*.parquet" - config_name: "sv-PL" data_files: - split: train path: "data/sv-PL/*.parquet" - config_name: "sv-PM" data_files: - split: train path: "data/sv-PM/*.parquet" - config_name: "sv-PT" data_files: - split: train path: "data/sv-PT/*.parquet" - config_name: "sv-PW" data_files: - split: train path: "data/sv-PW/*.parquet" - config_name: "sv-RE" data_files: - split: train path: "data/sv-RE/*.parquet" - config_name: "sv-RO" data_files: - split: train path: "data/sv-RO/*.parquet" - config_name: "sv-RS" data_files: - split: train path: "data/sv-RS/*.parquet" - config_name: "sv-RU" data_files: - split: train path: "data/sv-RU/*.parquet" - config_name: "sv-SA" data_files: - split: train path: 
"data/sv-SA/*.parquet" - config_name: "sv-SE" data_files: - split: train path: "data/sv-SE/*.parquet" - config_name: "sv-SI" data_files: - split: train path: "data/sv-SI/*.parquet" - config_name: "sv-SK" data_files: - split: train path: "data/sv-SK/*.parquet" - config_name: "sv-ST" data_files: - split: train path: "data/sv-ST/*.parquet" - config_name: "sv-SV" data_files: - split: train path: "data/sv-SV/*.parquet" - config_name: "sv-SW" data_files: - split: train path: "data/sv-SW/*.parquet" - config_name: "sv-TK" data_files: - split: train path: "data/sv-TK/*.parquet" - config_name: "sv-TO" data_files: - split: train path: "data/sv-TO/*.parquet" - config_name: "sv-TR" data_files: - split: train path: "data/sv-TR/*.parquet" - config_name: "sv-TV" data_files: - split: train path: "data/sv-TV/*.parquet" - config_name: "sv-TW" data_files: - split: train path: "data/sv-TW/*.parquet" - config_name: "sv-UA" data_files: - split: train path: "data/sv-UA/*.parquet" - config_name: "sv-US" data_files: - split: train path: "data/sv-US/*.parquet" - config_name: "sv-VA" data_files: - split: train path: "data/sv-VA/*.parquet" - config_name: "sv-VN" data_files: - split: train path: "data/sv-VN/*.parquet" - config_name: "sv-WS" data_files: - split: train path: "data/sv-WS/*.parquet" - config_name: "sv-XX" data_files: - split: train path: "data/sv-XX/*.parquet" - config_name: "sw-CN" data_files: - split: train path: "data/sw-CN/*.parquet" - config_name: "sw-FR" data_files: - split: train path: "data/sw-FR/*.parquet" - config_name: "sw-GB" data_files: - split: train path: "data/sw-GB/*.parquet" - config_name: "sw-KE" data_files: - split: train path: "data/sw-KE/*.parquet" - config_name: "sw-RU" data_files: - split: train path: "data/sw-RU/*.parquet" - config_name: "sw-TR" data_files: - split: train path: "data/sw-TR/*.parquet" - config_name: "sw-TZ" data_files: - split: train path: "data/sw-TZ/*.parquet" - config_name: "sw-US" data_files: - split: train path: "data/sw-US/*.parquet" - 
config_name: "sw-XX" data_files: - split: train path: "data/sw-XX/*.parquet" - config_name: "ta-CA" data_files: - split: train path: "data/ta-CA/*.parquet" - config_name: "ta-CF" data_files: - split: train path: "data/ta-CF/*.parquet" - config_name: "ta-DE" data_files: - split: train path: "data/ta-DE/*.parquet" - config_name: "ta-GB" data_files: - split: train path: "data/ta-GB/*.parquet" - config_name: "ta-IN" data_files: - split: train path: "data/ta-IN/*.parquet" - config_name: "ta-IT" data_files: - split: train path: "data/ta-IT/*.parquet" - config_name: "ta-LK" data_files: - split: train path: "data/ta-LK/*.parquet" - config_name: "ta-ME" data_files: - split: train path: "data/ta-ME/*.parquet" - config_name: "ta-RU" data_files: - split: train path: "data/ta-RU/*.parquet" - config_name: "ta-SG" data_files: - split: train path: "data/ta-SG/*.parquet" - config_name: "ta-US" data_files: - split: train path: "data/ta-US/*.parquet" - config_name: "ta-XX" data_files: - split: train path: "data/ta-XX/*.parquet" - config_name: "te-GB" data_files: - split: train path: "data/te-GB/*.parquet" - config_name: "te-IN" data_files: - split: train path: "data/te-IN/*.parquet" - config_name: "te-NL" data_files: - split: train path: "data/te-NL/*.parquet" - config_name: "te-RU" data_files: - split: train path: "data/te-RU/*.parquet" - config_name: "te-TE" data_files: - split: train path: "data/te-TE/*.parquet" - config_name: "te-US" data_files: - split: train path: "data/te-US/*.parquet" - config_name: "te-XX" data_files: - split: train path: "data/te-XX/*.parquet" - config_name: "tg-AM" data_files: - split: train path: "data/tg-AM/*.parquet" - config_name: "tg-BE" data_files: - split: train path: "data/tg-BE/*.parquet" - config_name: "tg-BR" data_files: - split: train path: "data/tg-BR/*.parquet" - config_name: "tg-CA" data_files: - split: train path: "data/tg-CA/*.parquet" - config_name: "tg-DE" data_files: - split: train path: "data/tg-DE/*.parquet" - config_name: "tg-EU" 
data_files: - split: train path: "data/tg-EU/*.parquet" - config_name: "tg-FR" data_files: - split: train path: "data/tg-FR/*.parquet" - config_name: "tg-GB" data_files: - split: train path: "data/tg-GB/*.parquet" - config_name: "tg-IN" data_files: - split: train path: "data/tg-IN/*.parquet" - config_name: "tg-IR" data_files: - split: train path: "data/tg-IR/*.parquet" - config_name: "tg-IS" data_files: - split: train path: "data/tg-IS/*.parquet" - config_name: "tg-KZ" data_files: - split: train path: "data/tg-KZ/*.parquet" - config_name: "tg-RU" data_files: - split: train path: "data/tg-RU/*.parquet" - config_name: "tg-TG" data_files: - split: train path: "data/tg-TG/*.parquet" - config_name: "tg-TJ" data_files: - split: train path: "data/tg-TJ/*.parquet" - config_name: "tg-TV" data_files: - split: train path: "data/tg-TV/*.parquet" - config_name: "tg-UA" data_files: - split: train path: "data/tg-UA/*.parquet" - config_name: "tg-US" data_files: - split: train path: "data/tg-US/*.parquet" - config_name: "tg-UZ" data_files: - split: train path: "data/tg-UZ/*.parquet" - config_name: "tg-XX" data_files: - split: train path: "data/tg-XX/*.parquet" - config_name: "th-AI" data_files: - split: train path: "data/th-AI/*.parquet" - config_name: "th-AU" data_files: - split: train path: "data/th-AU/*.parquet" - config_name: "th-BE" data_files: - split: train path: "data/th-BE/*.parquet" - config_name: "th-BG" data_files: - split: train path: "data/th-BG/*.parquet" - config_name: "th-BR" data_files: - split: train path: "data/th-BR/*.parquet" - config_name: "th-BZ" data_files: - split: train path: "data/th-BZ/*.parquet" - config_name: "th-CA" data_files: - split: train path: "data/th-CA/*.parquet" - config_name: "th-CC" data_files: - split: train path: "data/th-CC/*.parquet" - config_name: "th-CH" data_files: - split: train path: "data/th-CH/*.parquet" - config_name: "th-CL" data_files: - split: train path: "data/th-CL/*.parquet" - config_name: "th-CN" data_files: - split: 
train path: "data/th-CN/*.parquet" - config_name: "th-CO" data_files: - split: train path: "data/th-CO/*.parquet" - config_name: "th-CZ" data_files: - split: train path: "data/th-CZ/*.parquet" - config_name: "th-DE" data_files: - split: train path: "data/th-DE/*.parquet" - config_name: "th-EN" data_files: - split: train path: "data/th-EN/*.parquet" - config_name: "th-ES" data_files: - split: train path: "data/th-ES/*.parquet" - config_name: "th-EU" data_files: - split: train path: "data/th-EU/*.parquet" - config_name: "th-FM" data_files: - split: train path: "data/th-FM/*.parquet" - config_name: "th-FR" data_files: - split: train path: "data/th-FR/*.parquet" - config_name: "th-GA" data_files: - split: train path: "data/th-GA/*.parquet" - config_name: "th-GB" data_files: - split: train path: "data/th-GB/*.parquet" - config_name: "th-GG" data_files: - split: train path: "data/th-GG/*.parquet" - config_name: "th-GR" data_files: - split: train path: "data/th-GR/*.parquet" - config_name: "th-HK" data_files: - split: train path: "data/th-HK/*.parquet" - config_name: "th-HU" data_files: - split: train path: "data/th-HU/*.parquet" - config_name: "th-ID" data_files: - split: train path: "data/th-ID/*.parquet" - config_name: "th-IM" data_files: - split: train path: "data/th-IM/*.parquet" - config_name: "th-IN" data_files: - split: train path: "data/th-IN/*.parquet" - config_name: "th-IO" data_files: - split: train path: "data/th-IO/*.parquet" - config_name: "th-IR" data_files: - split: train path: "data/th-IR/*.parquet" - config_name: "th-IS" data_files: - split: train path: "data/th-IS/*.parquet" - config_name: "th-IT" data_files: - split: train path: "data/th-IT/*.parquet" - config_name: "th-JP" data_files: - split: train path: "data/th-JP/*.parquet" - config_name: "th-KR" data_files: - split: train path: "data/th-KR/*.parquet" - config_name: "th-LA" data_files: - split: train path: "data/th-LA/*.parquet" - config_name: "th-LT" data_files: - split: train path: 
"data/th-LT/*.parquet" - config_name: "th-LV" data_files: - split: train path: "data/th-LV/*.parquet" - config_name: "th-ME" data_files: - split: train path: "data/th-ME/*.parquet" - config_name: "th-MH" data_files: - split: train path: "data/th-MH/*.parquet" - config_name: "th-ML" data_files: - split: train path: "data/th-ML/*.parquet" - config_name: "th-MN" data_files: - split: train path: "data/th-MN/*.parquet" - config_name: "th-MX" data_files: - split: train path: "data/th-MX/*.parquet" - config_name: "th-MY" data_files: - split: train path: "data/th-MY/*.parquet" - config_name: "th-NL" data_files: - split: train path: "data/th-NL/*.parquet" - config_name: "th-NO" data_files: - split: train path: "data/th-NO/*.parquet" - config_name: "th-NZ" data_files: - split: train path: "data/th-NZ/*.parquet" - config_name: "th-PE" data_files: - split: train path: "data/th-PE/*.parquet" - config_name: "th-PL" data_files: - split: train path: "data/th-PL/*.parquet" - config_name: "th-PS" data_files: - split: train path: "data/th-PS/*.parquet" - config_name: "th-PW" data_files: - split: train path: "data/th-PW/*.parquet" - config_name: "th-RO" data_files: - split: train path: "data/th-RO/*.parquet" - config_name: "th-RU" data_files: - split: train path: "data/th-RU/*.parquet" - config_name: "th-SE" data_files: - split: train path: "data/th-SE/*.parquet" - config_name: "th-SG" data_files: - split: train path: "data/th-SG/*.parquet" - config_name: "th-SH" data_files: - split: train path: "data/th-SH/*.parquet" - config_name: "th-TA" data_files: - split: train path: "data/th-TA/*.parquet" - config_name: "th-TH" data_files: - split: train path: "data/th-TH/*.parquet" - config_name: "th-TK" data_files: - split: train path: "data/th-TK/*.parquet" - config_name: "th-TL" data_files: - split: train path: "data/th-TL/*.parquet" - config_name: "th-TR" data_files: - split: train path: "data/th-TR/*.parquet" - config_name: "th-TV" data_files: - split: train path: "data/th-TV/*.parquet" - 
config_name: "th-TW" data_files: - split: train path: "data/th-TW/*.parquet" - config_name: "th-US" data_files: - split: train path: "data/th-US/*.parquet" - config_name: "th-VI" data_files: - split: train path: "data/th-VI/*.parquet" - config_name: "th-VN" data_files: - split: train path: "data/th-VN/*.parquet" - config_name: "th-VU" data_files: - split: train path: "data/th-VU/*.parquet" - config_name: "th-WS" data_files: - split: train path: "data/th-WS/*.parquet" - config_name: "th-XX" data_files: - split: train path: "data/th-XX/*.parquet" - config_name: "th-YT" data_files: - split: train path: "data/th-YT/*.parquet" - config_name: "th-ZA" data_files: - split: train path: "data/th-ZA/*.parquet" - config_name: "tk-GB" data_files: - split: train path: "data/tk-GB/*.parquet" - config_name: "tk-IS" data_files: - split: train path: "data/tk-IS/*.parquet" - config_name: "tk-TK" data_files: - split: train path: "data/tk-TK/*.parquet" - config_name: "tk-TM" data_files: - split: train path: "data/tk-TM/*.parquet" - config_name: "tk-TR" data_files: - split: train path: "data/tk-TR/*.parquet" - config_name: "tk-XX" data_files: - split: train path: "data/tk-XX/*.parquet" - config_name: "tl-AI" data_files: - split: train path: "data/tl-AI/*.parquet" - config_name: "tl-BE" data_files: - split: train path: "data/tl-BE/*.parquet" - config_name: "tl-BR" data_files: - split: train path: "data/tl-BR/*.parquet" - config_name: "tl-CA" data_files: - split: train path: "data/tl-CA/*.parquet" - config_name: "tl-CF" data_files: - split: train path: "data/tl-CF/*.parquet" - config_name: "tl-CN" data_files: - split: train path: "data/tl-CN/*.parquet" - config_name: "tl-CO" data_files: - split: train path: "data/tl-CO/*.parquet" - config_name: "tl-DE" data_files: - split: train path: "data/tl-DE/*.parquet" - config_name: "tl-DK" data_files: - split: train path: "data/tl-DK/*.parquet" - config_name: "tl-ES" data_files: - split: train path: "data/tl-ES/*.parquet" - config_name: "tl-EU" 
data_files: - split: train path: "data/tl-EU/*.parquet" - config_name: "tl-FM" data_files: - split: train path: "data/tl-FM/*.parquet" - config_name: "tl-FR" data_files: - split: train path: "data/tl-FR/*.parquet" - config_name: "tl-GB" data_files: - split: train path: "data/tl-GB/*.parquet" - config_name: "tl-IN" data_files: - split: train path: "data/tl-IN/*.parquet" - config_name: "tl-IO" data_files: - split: train path: "data/tl-IO/*.parquet" - config_name: "tl-IS" data_files: - split: train path: "data/tl-IS/*.parquet" - config_name: "tl-IT" data_files: - split: train path: "data/tl-IT/*.parquet" - config_name: "tl-NL" data_files: - split: train path: "data/tl-NL/*.parquet" - config_name: "tl-NU" data_files: - split: train path: "data/tl-NU/*.parquet" - config_name: "tl-PH" data_files: - split: train path: "data/tl-PH/*.parquet" - config_name: "tl-PK" data_files: - split: train path: "data/tl-PK/*.parquet" - config_name: "tl-PL" data_files: - split: train path: "data/tl-PL/*.parquet" - config_name: "tl-RU" data_files: - split: train path: "data/tl-RU/*.parquet" - config_name: "tl-TK" data_files: - split: train path: "data/tl-TK/*.parquet" - config_name: "tl-TL" data_files: - split: train path: "data/tl-TL/*.parquet" - config_name: "tl-TV" data_files: - split: train path: "data/tl-TV/*.parquet" - config_name: "tl-TW" data_files: - split: train path: "data/tl-TW/*.parquet" - config_name: "tl-UA" data_files: - split: train path: "data/tl-UA/*.parquet" - config_name: "tl-US" data_files: - split: train path: "data/tl-US/*.parquet" - config_name: "tl-VN" data_files: - split: train path: "data/tl-VN/*.parquet" - config_name: "tl-XX" data_files: - split: train path: "data/tl-XX/*.parquet" - config_name: "tr-AA" data_files: - split: train path: "data/tr-AA/*.parquet" - config_name: "tr-AE" data_files: - split: train path: "data/tr-AE/*.parquet" - config_name: "tr-AI" data_files: - split: train path: "data/tr-AI/*.parquet" - config_name: "tr-AL" data_files: - split: 
train path: "data/tr-AL/*.parquet" - config_name: "tr-AM" data_files: - split: train path: "data/tr-AM/*.parquet" - config_name: "tr-AR" data_files: - split: train path: "data/tr-AR/*.parquet" - config_name: "tr-AT" data_files: - split: train path: "data/tr-AT/*.parquet" - config_name: "tr-AU" data_files: - split: train path: "data/tr-AU/*.parquet" - config_name: "tr-AZ" data_files: - split: train path: "data/tr-AZ/*.parquet" - config_name: "tr-BA" data_files: - split: train path: "data/tr-BA/*.parquet" - config_name: "tr-BD" data_files: - split: train path: "data/tr-BD/*.parquet" - config_name: "tr-BE" data_files: - split: train path: "data/tr-BE/*.parquet" - config_name: "tr-BG" data_files: - split: train path: "data/tr-BG/*.parquet" - config_name: "tr-BO" data_files: - split: train path: "data/tr-BO/*.parquet" - config_name: "tr-BR" data_files: - split: train path: "data/tr-BR/*.parquet" - config_name: "tr-BZ" data_files: - split: train path: "data/tr-BZ/*.parquet" - config_name: "tr-CA" data_files: - split: train path: "data/tr-CA/*.parquet" - config_name: "tr-CC" data_files: - split: train path: "data/tr-CC/*.parquet" - config_name: "tr-CF" data_files: - split: train path: "data/tr-CF/*.parquet" - config_name: "tr-CH" data_files: - split: train path: "data/tr-CH/*.parquet" - config_name: "tr-CL" data_files: - split: train path: "data/tr-CL/*.parquet" - config_name: "tr-CN" data_files: - split: train path: "data/tr-CN/*.parquet" - config_name: "tr-CO" data_files: - split: train path: "data/tr-CO/*.parquet" - config_name: "tr-CX" data_files: - split: train path: "data/tr-CX/*.parquet" - config_name: "tr-CY" data_files: - split: train path: "data/tr-CY/*.parquet" - config_name: "tr-CZ" data_files: - split: train path: "data/tr-CZ/*.parquet" - config_name: "tr-DE" data_files: - split: train path: "data/tr-DE/*.parquet" - config_name: "tr-DJ" data_files: - split: train path: "data/tr-DJ/*.parquet" - config_name: "tr-DK" data_files: - split: train path: 
"data/tr-DK/*.parquet" - config_name: "tr-DO" data_files: - split: train path: "data/tr-DO/*.parquet" - config_name: "tr-EE" data_files: - split: train path: "data/tr-EE/*.parquet" - config_name: "tr-EG" data_files: - split: train path: "data/tr-EG/*.parquet" - config_name: "tr-EN" data_files: - split: train path: "data/tr-EN/*.parquet" - config_name: "tr-ES" data_files: - split: train path: "data/tr-ES/*.parquet" - config_name: "tr-EU" data_files: - split: train path: "data/tr-EU/*.parquet" - config_name: "tr-FI" data_files: - split: train path: "data/tr-FI/*.parquet" - config_name: "tr-FM" data_files: - split: train path: "data/tr-FM/*.parquet" - config_name: "tr-FR" data_files: - split: train path: "data/tr-FR/*.parquet" - config_name: "tr-GA" data_files: - split: train path: "data/tr-GA/*.parquet" - config_name: "tr-GB" data_files: - split: train path: "data/tr-GB/*.parquet" - config_name: "tr-GE" data_files: - split: train path: "data/tr-GE/*.parquet" - config_name: "tr-GG" data_files: - split: train path: "data/tr-GG/*.parquet" - config_name: "tr-GQ" data_files: - split: train path: "data/tr-GQ/*.parquet" - config_name: "tr-GR" data_files: - split: train path: "data/tr-GR/*.parquet" - config_name: "tr-HR" data_files: - split: train path: "data/tr-HR/*.parquet" - config_name: "tr-HU" data_files: - split: train path: "data/tr-HU/*.parquet" - config_name: "tr-ID" data_files: - split: train path: "data/tr-ID/*.parquet" - config_name: "tr-IE" data_files: - split: train path: "data/tr-IE/*.parquet" - config_name: "tr-IL" data_files: - split: train path: "data/tr-IL/*.parquet" - config_name: "tr-IM" data_files: - split: train path: "data/tr-IM/*.parquet" - config_name: "tr-IN" data_files: - split: train path: "data/tr-IN/*.parquet" - config_name: "tr-IO" data_files: - split: train path: "data/tr-IO/*.parquet" - config_name: "tr-IQ" data_files: - split: train path: "data/tr-IQ/*.parquet" - config_name: "tr-IR" data_files: - split: train path: "data/tr-IR/*.parquet" - 
config_name: "tr-IT" data_files: - split: train path: "data/tr-IT/*.parquet" - config_name: "tr-JP" data_files: - split: train path: "data/tr-JP/*.parquet" - config_name: "tr-KG" data_files: - split: train path: "data/tr-KG/*.parquet" - config_name: "tr-KR" data_files: - split: train path: "data/tr-KR/*.parquet" - config_name: "tr-KZ" data_files: - split: train path: "data/tr-KZ/*.parquet" - config_name: "tr-LA" data_files: - split: train path: "data/tr-LA/*.parquet" - config_name: "tr-LI" data_files: - split: train path: "data/tr-LI/*.parquet" - config_name: "tr-LT" data_files: - split: train path: "data/tr-LT/*.parquet" - config_name: "tr-LU" data_files: - split: train path: "data/tr-LU/*.parquet" - config_name: "tr-LV" data_files: - split: train path: "data/tr-LV/*.parquet" - config_name: "tr-LY" data_files: - split: train path: "data/tr-LY/*.parquet" - config_name: "tr-MA" data_files: - split: train path: "data/tr-MA/*.parquet" - config_name: "tr-ME" data_files: - split: train path: "data/tr-ME/*.parquet" - config_name: "tr-ML" data_files: - split: train path: "data/tr-ML/*.parquet" - config_name: "tr-MX" data_files: - split: train path: "data/tr-MX/*.parquet" - config_name: "tr-NL" data_files: - split: train path: "data/tr-NL/*.parquet" - config_name: "tr-NO" data_files: - split: train path: "data/tr-NO/*.parquet" - config_name: "tr-NU" data_files: - split: train path: "data/tr-NU/*.parquet" - config_name: "tr-PE" data_files: - split: train path: "data/tr-PE/*.parquet" - config_name: "tr-PL" data_files: - split: train path: "data/tr-PL/*.parquet" - config_name: "tr-PT" data_files: - split: train path: "data/tr-PT/*.parquet" - config_name: "tr-PW" data_files: - split: train path: "data/tr-PW/*.parquet" - config_name: "tr-RO" data_files: - split: train path: "data/tr-RO/*.parquet" - config_name: "tr-RS" data_files: - split: train path: "data/tr-RS/*.parquet" - config_name: "tr-RU" data_files: - split: train path: "data/tr-RU/*.parquet" - config_name: "tr-SA" 
data_files: - split: train path: "data/tr-SA/*.parquet" - config_name: "tr-SE" data_files: - split: train path: "data/tr-SE/*.parquet" - config_name: "tr-SG" data_files: - split: train path: "data/tr-SG/*.parquet" - config_name: "tr-SI" data_files: - split: train path: "data/tr-SI/*.parquet" - config_name: "tr-SK" data_files: - split: train path: "data/tr-SK/*.parquet" - config_name: "tr-ST" data_files: - split: train path: "data/tr-ST/*.parquet" - config_name: "tr-TC" data_files: - split: train path: "data/tr-TC/*.parquet" - config_name: "tr-TH" data_files: - split: train path: "data/tr-TH/*.parquet" - config_name: "tr-TK" data_files: - split: train path: "data/tr-TK/*.parquet" - config_name: "tr-TM" data_files: - split: train path: "data/tr-TM/*.parquet" - config_name: "tr-TR" data_files: - split: train path: "data/tr-TR/*.parquet" - config_name: "tr-TV" data_files: - split: train path: "data/tr-TV/*.parquet" - config_name: "tr-TW" data_files: - split: train path: "data/tr-TW/*.parquet" - config_name: "tr-UA" data_files: - split: train path: "data/tr-UA/*.parquet" - config_name: "tr-US" data_files: - split: train path: "data/tr-US/*.parquet" - config_name: "tr-UY" data_files: - split: train path: "data/tr-UY/*.parquet" - config_name: "tr-VE" data_files: - split: train path: "data/tr-VE/*.parquet" - config_name: "tr-VN" data_files: - split: train path: "data/tr-VN/*.parquet" - config_name: "tr-WS" data_files: - split: train path: "data/tr-WS/*.parquet" - config_name: "tr-XX" data_files: - split: train path: "data/tr-XX/*.parquet" - config_name: "tr-ZA" data_files: - split: train path: "data/tr-ZA/*.parquet" - config_name: "tt-RU" data_files: - split: train path: "data/tt-RU/*.parquet" - config_name: "tt-TR" data_files: - split: train path: "data/tt-TR/*.parquet" - config_name: "tt-TT" data_files: - split: train path: "data/tt-TT/*.parquet" - config_name: "tt-XX" data_files: - split: train path: "data/tt-XX/*.parquet" - config_name: "ug-CN" data_files: - split: 
train path: "data/ug-CN/*.parquet" - config_name: "ug-HK" data_files: - split: train path: "data/ug-HK/*.parquet" - config_name: "ug-KZ" data_files: - split: train path: "data/ug-KZ/*.parquet" - config_name: "ug-NO" data_files: - split: train path: "data/ug-NO/*.parquet" - config_name: "ug-RU" data_files: - split: train path: "data/ug-RU/*.parquet" - config_name: "ug-US" data_files: - split: train path: "data/ug-US/*.parquet" - config_name: "ug-XX" data_files: - split: train path: "data/ug-XX/*.parquet" - config_name: "uk-AE" data_files: - split: train path: "data/uk-AE/*.parquet" - config_name: "uk-AM" data_files: - split: train path: "data/uk-AM/*.parquet" - config_name: "uk-AT" data_files: - split: train path: "data/uk-AT/*.parquet" - config_name: "uk-AW" data_files: - split: train path: "data/uk-AW/*.parquet" - config_name: "uk-BG" data_files: - split: train path: "data/uk-BG/*.parquet" - config_name: "uk-BR" data_files: - split: train path: "data/uk-BR/*.parquet" - config_name: "uk-BY" data_files: - split: train path: "data/uk-BY/*.parquet" - config_name: "uk-CA" data_files: - split: train path: "data/uk-CA/*.parquet" - config_name: "uk-CC" data_files: - split: train path: "data/uk-CC/*.parquet" - config_name: "uk-CD" data_files: - split: train path: "data/uk-CD/*.parquet" - config_name: "uk-CF" data_files: - split: train path: "data/uk-CF/*.parquet" - config_name: "uk-CH" data_files: - split: train path: "data/uk-CH/*.parquet" - config_name: "uk-CN" data_files: - split: train path: "data/uk-CN/*.parquet" - config_name: "uk-CO" data_files: - split: train path: "data/uk-CO/*.parquet" - config_name: "uk-CZ" data_files: - split: train path: "data/uk-CZ/*.parquet" - config_name: "uk-DE" data_files: - split: train path: "data/uk-DE/*.parquet" - config_name: "uk-DK" data_files: - split: train path: "data/uk-DK/*.parquet" - config_name: "uk-EE" data_files: - split: train path: "data/uk-EE/*.parquet" - config_name: "uk-EN" data_files: - split: train path: 
"data/uk-EN/*.parquet" - config_name: "uk-ES" data_files: - split: train path: "data/uk-ES/*.parquet" - config_name: "uk-EU" data_files: - split: train path: "data/uk-EU/*.parquet" - config_name: "uk-FI" data_files: - split: train path: "data/uk-FI/*.parquet" - config_name: "uk-FM" data_files: - split: train path: "data/uk-FM/*.parquet" - config_name: "uk-FR" data_files: - split: train path: "data/uk-FR/*.parquet" - config_name: "uk-GA" data_files: - split: train path: "data/uk-GA/*.parquet" - config_name: "uk-GB" data_files: - split: train path: "data/uk-GB/*.parquet" - config_name: "uk-GE" data_files: - split: train path: "data/uk-GE/*.parquet" - config_name: "uk-GG" data_files: - split: train path: "data/uk-GG/*.parquet" - config_name: "uk-GQ" data_files: - split: train path: "data/uk-GQ/*.parquet" - config_name: "uk-GR" data_files: - split: train path: "data/uk-GR/*.parquet" - config_name: "uk-HR" data_files: - split: train path: "data/uk-HR/*.parquet" - config_name: "uk-HT" data_files: - split: train path: "data/uk-HT/*.parquet" - config_name: "uk-HU" data_files: - split: train path: "data/uk-HU/*.parquet" - config_name: "uk-ID" data_files: - split: train path: "data/uk-ID/*.parquet" - config_name: "uk-IE" data_files: - split: train path: "data/uk-IE/*.parquet" - config_name: "uk-IL" data_files: - split: train path: "data/uk-IL/*.parquet" - config_name: "uk-IN" data_files: - split: train path: "data/uk-IN/*.parquet" - config_name: "uk-IO" data_files: - split: train path: "data/uk-IO/*.parquet" - config_name: "uk-IS" data_files: - split: train path: "data/uk-IS/*.parquet" - config_name: "uk-IT" data_files: - split: train path: "data/uk-IT/*.parquet" - config_name: "uk-KG" data_files: - split: train path: "data/uk-KG/*.parquet" - config_name: "uk-KR" data_files: - split: train path: "data/uk-KR/*.parquet" - config_name: "uk-KZ" data_files: - split: train path: "data/uk-KZ/*.parquet" - config_name: "uk-LI" data_files: - split: train path: "data/uk-LI/*.parquet" - 
config_name: "uk-LM" data_files: - split: train path: "data/uk-LM/*.parquet" - config_name: "uk-LT" data_files: - split: train path: "data/uk-LT/*.parquet" - config_name: "uk-LV" data_files: - split: train path: "data/uk-LV/*.parquet" - config_name: "uk-MA" data_files: - split: train path: "data/uk-MA/*.parquet" - config_name: "uk-MD" data_files: - split: train path: "data/uk-MD/*.parquet" - config_name: "uk-ME" data_files: - split: train path: "data/uk-ME/*.parquet" - config_name: "uk-ML" data_files: - split: train path: "data/uk-ML/*.parquet" - config_name: "uk-MX" data_files: - split: train path: "data/uk-MX/*.parquet" - config_name: "uk-NL" data_files: - split: train path: "data/uk-NL/*.parquet" - config_name: "uk-PL" data_files: - split: train path: "data/uk-PL/*.parquet" - config_name: "uk-PM" data_files: - split: train path: "data/uk-PM/*.parquet" - config_name: "uk-PN" data_files: - split: train path: "data/uk-PN/*.parquet" - config_name: "uk-PT" data_files: - split: train path: "data/uk-PT/*.parquet" - config_name: "uk-RO" data_files: - split: train path: "data/uk-RO/*.parquet" - config_name: "uk-RS" data_files: - split: train path: "data/uk-RS/*.parquet" - config_name: "uk-RU" data_files: - split: train path: "data/uk-RU/*.parquet" - config_name: "uk-SE" data_files: - split: train path: "data/uk-SE/*.parquet" - config_name: "uk-SK" data_files: - split: train path: "data/uk-SK/*.parquet" - config_name: "uk-TH" data_files: - split: train path: "data/uk-TH/*.parquet" - config_name: "uk-TK" data_files: - split: train path: "data/uk-TK/*.parquet" - config_name: "uk-TO" data_files: - split: train path: "data/uk-TO/*.parquet" - config_name: "uk-TR" data_files: - split: train path: "data/uk-TR/*.parquet" - config_name: "uk-TV" data_files: - split: train path: "data/uk-TV/*.parquet" - config_name: "uk-TW" data_files: - split: train path: "data/uk-TW/*.parquet" - config_name: "uk-UA" data_files: - split: train path: "data/uk-UA/*.parquet" - config_name: "uk-UK" 
data_files: - split: train path: "data/uk-UK/*.parquet" - config_name: "uk-US" data_files: - split: train path: "data/uk-US/*.parquet" - config_name: "uk-UY" data_files: - split: train path: "data/uk-UY/*.parquet" - config_name: "uk-UZ" data_files: - split: train path: "data/uk-UZ/*.parquet" - config_name: "uk-VA" data_files: - split: train path: "data/uk-VA/*.parquet" - config_name: "uk-VC" data_files: - split: train path: "data/uk-VC/*.parquet" - config_name: "uk-VG" data_files: - split: train path: "data/uk-VG/*.parquet" - config_name: "uk-WS" data_files: - split: train path: "data/uk-WS/*.parquet" - config_name: "uk-XX" data_files: - split: train path: "data/uk-XX/*.parquet" - config_name: "uk-ZA" data_files: - split: train path: "data/uk-ZA/*.parquet" - config_name: "ur-AA" data_files: - split: train path: "data/ur-AA/*.parquet" - config_name: "ur-AE" data_files: - split: train path: "data/ur-AE/*.parquet" - config_name: "ur-AU" data_files: - split: train path: "data/ur-AU/*.parquet" - config_name: "ur-CA" data_files: - split: train path: "data/ur-CA/*.parquet" - config_name: "ur-CN" data_files: - split: train path: "data/ur-CN/*.parquet" - config_name: "ur-CO" data_files: - split: train path: "data/ur-CO/*.parquet" - config_name: "ur-DE" data_files: - split: train path: "data/ur-DE/*.parquet" - config_name: "ur-ES" data_files: - split: train path: "data/ur-ES/*.parquet" - config_name: "ur-EU" data_files: - split: train path: "data/ur-EU/*.parquet" - config_name: "ur-GB" data_files: - split: train path: "data/ur-GB/*.parquet" - config_name: "ur-IN" data_files: - split: train path: "data/ur-IN/*.parquet" - config_name: "ur-IQ" data_files: - split: train path: "data/ur-IQ/*.parquet" - config_name: "ur-IR" data_files: - split: train path: "data/ur-IR/*.parquet" - config_name: "ur-LB" data_files: - split: train path: "data/ur-LB/*.parquet" - config_name: "ur-LT" data_files: - split: train path: "data/ur-LT/*.parquet" - config_name: "ur-NL" data_files: - split: 
train path: "data/ur-NL/*.parquet" - config_name: "ur-NO" data_files: - split: train path: "data/ur-NO/*.parquet" - config_name: "ur-PK" data_files: - split: train path: "data/ur-PK/*.parquet" - config_name: "ur-PL" data_files: - split: train path: "data/ur-PL/*.parquet" - config_name: "ur-PT" data_files: - split: train path: "data/ur-PT/*.parquet" - config_name: "ur-RU" data_files: - split: train path: "data/ur-RU/*.parquet" - config_name: "ur-SA" data_files: - split: train path: "data/ur-SA/*.parquet" - config_name: "ur-TK" data_files: - split: train path: "data/ur-TK/*.parquet" - config_name: "ur-TR" data_files: - split: train path: "data/ur-TR/*.parquet" - config_name: "ur-TV" data_files: - split: train path: "data/ur-TV/*.parquet" - config_name: "ur-UR" data_files: - split: train path: "data/ur-UR/*.parquet" - config_name: "ur-US" data_files: - split: train path: "data/ur-US/*.parquet" - config_name: "ur-UZ" data_files: - split: train path: "data/ur-UZ/*.parquet" - config_name: "ur-XX" data_files: - split: train path: "data/ur-XX/*.parquet" - config_name: "uz-RU" data_files: - split: train path: "data/uz-RU/*.parquet" - config_name: "uz-TR" data_files: - split: train path: "data/uz-TR/*.parquet" - config_name: "uz-US" data_files: - split: train path: "data/uz-US/*.parquet" - config_name: "uz-UZ" data_files: - split: train path: "data/uz-UZ/*.parquet" - config_name: "uz-XX" data_files: - split: train path: "data/uz-XX/*.parquet" - config_name: "vi-AD" data_files: - split: train path: "data/vi-AD/*.parquet" - config_name: "vi-AE" data_files: - split: train path: "data/vi-AE/*.parquet" - config_name: "vi-AG" data_files: - split: train path: "data/vi-AG/*.parquet" - config_name: "vi-AI" data_files: - split: train path: "data/vi-AI/*.parquet" - config_name: "vi-AM" data_files: - split: train path: "data/vi-AM/*.parquet" - config_name: "vi-AR" data_files: - split: train path: "data/vi-AR/*.parquet" - config_name: "vi-AU" data_files: - split: train path: 
"data/vi-AU/*.parquet" - config_name: "vi-BE" data_files: - split: train path: "data/vi-BE/*.parquet" - config_name: "vi-BR" data_files: - split: train path: "data/vi-BR/*.parquet" - config_name: "vi-BZ" data_files: - split: train path: "data/vi-BZ/*.parquet" - config_name: "vi-CA" data_files: - split: train path: "data/vi-CA/*.parquet" - config_name: "vi-CC" data_files: - split: train path: "data/vi-CC/*.parquet" - config_name: "vi-CF" data_files: - split: train path: "data/vi-CF/*.parquet" - config_name: "vi-CH" data_files: - split: train path: "data/vi-CH/*.parquet" - config_name: "vi-CN" data_files: - split: train path: "data/vi-CN/*.parquet" - config_name: "vi-CO" data_files: - split: train path: "data/vi-CO/*.parquet" - config_name: "vi-CX" data_files: - split: train path: "data/vi-CX/*.parquet" - config_name: "vi-CY" data_files: - split: train path: "data/vi-CY/*.parquet" - config_name: "vi-CZ" data_files: - split: train path: "data/vi-CZ/*.parquet" - config_name: "vi-DE" data_files: - split: train path: "data/vi-DE/*.parquet" - config_name: "vi-DK" data_files: - split: train path: "data/vi-DK/*.parquet" - config_name: "vi-DO" data_files: - split: train path: "data/vi-DO/*.parquet" - config_name: "vi-EC" data_files: - split: train path: "data/vi-EC/*.parquet" - config_name: "vi-EN" data_files: - split: train path: "data/vi-EN/*.parquet" - config_name: "vi-ES" data_files: - split: train path: "data/vi-ES/*.parquet" - config_name: "vi-EU" data_files: - split: train path: "data/vi-EU/*.parquet" - config_name: "vi-FI" data_files: - split: train path: "data/vi-FI/*.parquet" - config_name: "vi-FM" data_files: - split: train path: "data/vi-FM/*.parquet" - config_name: "vi-FR" data_files: - split: train path: "data/vi-FR/*.parquet" - config_name: "vi-GA" data_files: - split: train path: "data/vi-GA/*.parquet" - config_name: "vi-GB" data_files: - split: train path: "data/vi-GB/*.parquet" - config_name: "vi-GG" data_files: - split: train path: "data/vi-GG/*.parquet" - 
config_name: "vi-GO" data_files: - split: train path: "data/vi-GO/*.parquet" - config_name: "vi-GQ" data_files: - split: train path: "data/vi-GQ/*.parquet" - config_name: "vi-GR" data_files: - split: train path: "data/vi-GR/*.parquet" - config_name: "vi-HK" data_files: - split: train path: "data/vi-HK/*.parquet" - config_name: "vi-HU" data_files: - split: train path: "data/vi-HU/*.parquet" - config_name: "vi-ID" data_files: - split: train path: "data/vi-ID/*.parquet" - config_name: "vi-IE" data_files: - split: train path: "data/vi-IE/*.parquet" - config_name: "vi-IL" data_files: - split: train path: "data/vi-IL/*.parquet" - config_name: "vi-IM" data_files: - split: train path: "data/vi-IM/*.parquet" - config_name: "vi-IN" data_files: - split: train path: "data/vi-IN/*.parquet" - config_name: "vi-IO" data_files: - split: train path: "data/vi-IO/*.parquet" - config_name: "vi-IR" data_files: - split: train path: "data/vi-IR/*.parquet" - config_name: "vi-IS" data_files: - split: train path: "data/vi-IS/*.parquet" - config_name: "vi-IT" data_files: - split: train path: "data/vi-IT/*.parquet" - config_name: "vi-JP" data_files: - split: train path: "data/vi-JP/*.parquet" - config_name: "vi-KR" data_files: - split: train path: "data/vi-KR/*.parquet" - config_name: "vi-KY" data_files: - split: train path: "data/vi-KY/*.parquet" - config_name: "vi-LA" data_files: - split: train path: "data/vi-LA/*.parquet" - config_name: "vi-LI" data_files: - split: train path: "data/vi-LI/*.parquet" - config_name: "vi-LT" data_files: - split: train path: "data/vi-LT/*.parquet" - config_name: "vi-LU" data_files: - split: train path: "data/vi-LU/*.parquet" - config_name: "vi-LY" data_files: - split: train path: "data/vi-LY/*.parquet" - config_name: "vi-ME" data_files: - split: train path: "data/vi-ME/*.parquet" - config_name: "vi-ML" data_files: - split: train path: "data/vi-ML/*.parquet" - config_name: "vi-MM" data_files: - split: train path: "data/vi-MM/*.parquet" - config_name: "vi-MX" 
data_files: - split: train path: "data/vi-MX/*.parquet" - config_name: "vi-MY" data_files: - split: train path: "data/vi-MY/*.parquet" - config_name: "vi-NG" data_files: - split: train path: "data/vi-NG/*.parquet" - config_name: "vi-NL" data_files: - split: train path: "data/vi-NL/*.parquet" - config_name: "vi-NO" data_files: - split: train path: "data/vi-NO/*.parquet" - config_name: "vi-NZ" data_files: - split: train path: "data/vi-NZ/*.parquet" - config_name: "vi-PE" data_files: - split: train path: "data/vi-PE/*.parquet" - config_name: "vi-PH" data_files: - split: train path: "data/vi-PH/*.parquet" - config_name: "vi-PL" data_files: - split: train path: "data/vi-PL/*.parquet" - config_name: "vi-PT" data_files: - split: train path: "data/vi-PT/*.parquet" - config_name: "vi-PW" data_files: - split: train path: "data/vi-PW/*.parquet" - config_name: "vi-RE" data_files: - split: train path: "data/vi-RE/*.parquet" - config_name: "vi-RO" data_files: - split: train path: "data/vi-RO/*.parquet" - config_name: "vi-RU" data_files: - split: train path: "data/vi-RU/*.parquet" - config_name: "vi-SA" data_files: - split: train path: "data/vi-SA/*.parquet" - config_name: "vi-SE" data_files: - split: train path: "data/vi-SE/*.parquet" - config_name: "vi-SG" data_files: - split: train path: "data/vi-SG/*.parquet" - config_name: "vi-SH" data_files: - split: train path: "data/vi-SH/*.parquet" - config_name: "vi-SK" data_files: - split: train path: "data/vi-SK/*.parquet" - config_name: "vi-SO" data_files: - split: train path: "data/vi-SO/*.parquet" - config_name: "vi-ST" data_files: - split: train path: "data/vi-ST/*.parquet" - config_name: "vi-TH" data_files: - split: train path: "data/vi-TH/*.parquet" - config_name: "vi-TK" data_files: - split: train path: "data/vi-TK/*.parquet" - config_name: "vi-TL" data_files: - split: train path: "data/vi-TL/*.parquet" - config_name: "vi-TO" data_files: - split: train path: "data/vi-TO/*.parquet" - config_name: "vi-TR" data_files: - split: 
train path: "data/vi-TR/*.parquet" - config_name: "vi-TV" data_files: - split: train path: "data/vi-TV/*.parquet" - config_name: "vi-TW" data_files: - split: train path: "data/vi-TW/*.parquet" - config_name: "vi-UA" data_files: - split: train path: "data/vi-UA/*.parquet" - config_name: "vi-UK" data_files: - split: train path: "data/vi-UK/*.parquet" - config_name: "vi-US" data_files: - split: train path: "data/vi-US/*.parquet" - config_name: "vi-VA" data_files: - split: train path: "data/vi-VA/*.parquet" - config_name: "vi-VC" data_files: - split: train path: "data/vi-VC/*.parquet" - config_name: "vi-VE" data_files: - split: train path: "data/vi-VE/*.parquet" - config_name: "vi-VI" data_files: - split: train path: "data/vi-VI/*.parquet" - config_name: "vi-VN" data_files: - split: train path: "data/vi-VN/*.parquet" - config_name: "vi-WS" data_files: - split: train path: "data/vi-WS/*.parquet" - config_name: "vi-XX" data_files: - split: train path: "data/vi-XX/*.parquet" - config_name: "vi-ZA" data_files: - split: train path: "data/vi-ZA/*.parquet" - config_name: "vo-XX" data_files: - split: train path: "data/vo-XX/*.parquet" - config_name: "wa-XX" data_files: - split: train path: "data/wa-XX/*.parquet" - config_name: "war-IS" data_files: - split: train path: "data/war-IS/*.parquet" - config_name: "war-XX" data_files: - split: train path: "data/war-XX/*.parquet" - config_name: "xal-IS" data_files: - split: train path: "data/xal-IS/*.parquet" - config_name: "xmf-XX" data_files: - split: train path: "data/xmf-XX/*.parquet" - config_name: "yi-CO" data_files: - split: train path: "data/yi-CO/*.parquet" - config_name: "yi-DE" data_files: - split: train path: "data/yi-DE/*.parquet" - config_name: "yi-IL" data_files: - split: train path: "data/yi-IL/*.parquet" - config_name: "yi-JP" data_files: - split: train path: "data/yi-JP/*.parquet" - config_name: "yi-RU" data_files: - split: train path: "data/yi-RU/*.parquet" - config_name: "yi-US" data_files: - split: train path: 
"data/yi-US/*.parquet" - config_name: "yi-XX" data_files: - split: train path: "data/yi-XX/*.parquet" - config_name: "yo-NG" data_files: - split: train path: "data/yo-NG/*.parquet" - config_name: "yo-XX" data_files: - split: train path: "data/yo-XX/*.parquet" - config_name: "zh-AE" data_files: - split: train path: "data/zh-AE/*.parquet" - config_name: "zh-AI" data_files: - split: train path: "data/zh-AI/*.parquet" - config_name: "zh-AL" data_files: - split: train path: "data/zh-AL/*.parquet" - config_name: "zh-AM" data_files: - split: train path: "data/zh-AM/*.parquet" - config_name: "zh-AR" data_files: - split: train path: "data/zh-AR/*.parquet" - config_name: "zh-AT" data_files: - split: train path: "data/zh-AT/*.parquet" - config_name: "zh-AU" data_files: - split: train path: "data/zh-AU/*.parquet" - config_name: "zh-BD" data_files: - split: train path: "data/zh-BD/*.parquet" - config_name: "zh-BR" data_files: - split: train path: "data/zh-BR/*.parquet" - config_name: "zh-BY" data_files: - split: train path: "data/zh-BY/*.parquet" - config_name: "zh-BZ" data_files: - split: train path: "data/zh-BZ/*.parquet" - config_name: "zh-CA" data_files: - split: train path: "data/zh-CA/*.parquet" - config_name: "zh-CC" data_files: - split: train path: "data/zh-CC/*.parquet" - config_name: "zh-CD" data_files: - split: train path: "data/zh-CD/*.parquet" - config_name: "zh-CF" data_files: - split: train path: "data/zh-CF/*.parquet" - config_name: "zh-CH" data_files: - split: train path: "data/zh-CH/*.parquet" - config_name: "zh-CI" data_files: - split: train path: "data/zh-CI/*.parquet" - config_name: "zh-CL" data_files: - split: train path: "data/zh-CL/*.parquet" - config_name: "zh-CM" data_files: - split: train path: "data/zh-CM/*.parquet" - config_name: "zh-CN" data_files: - split: train path: "data/zh-CN/*.parquet" - config_name: "zh-CO" data_files: - split: train path: "data/zh-CO/*.parquet" - config_name: "zh-CX" data_files: - split: train path: "data/zh-CX/*.parquet" - 
config_name: "zh-CY" data_files: - split: train path: "data/zh-CY/*.parquet" - config_name: "zh-CZ" data_files: - split: train path: "data/zh-CZ/*.parquet" - config_name: "zh-DE" data_files: - split: train path: "data/zh-DE/*.parquet" - config_name: "zh-DJ" data_files: - split: train path: "data/zh-DJ/*.parquet" - config_name: "zh-DK" data_files: - split: train path: "data/zh-DK/*.parquet" - config_name: "zh-DO" data_files: - split: train path: "data/zh-DO/*.parquet" - config_name: "zh-EE" data_files: - split: train path: "data/zh-EE/*.parquet" - config_name: "zh-EN" data_files: - split: train path: "data/zh-EN/*.parquet" - config_name: "zh-ES" data_files: - split: train path: "data/zh-ES/*.parquet" - config_name: "zh-EU" data_files: - split: train path: "data/zh-EU/*.parquet" - config_name: "zh-FI" data_files: - split: train path: "data/zh-FI/*.parquet" - config_name: "zh-FM" data_files: - split: train path: "data/zh-FM/*.parquet" - config_name: "zh-FR" data_files: - split: train path: "data/zh-FR/*.parquet" - config_name: "zh-GA" data_files: - split: train path: "data/zh-GA/*.parquet" - config_name: "zh-GB" data_files: - split: train path: "data/zh-GB/*.parquet" - config_name: "zh-GE" data_files: - split: train path: "data/zh-GE/*.parquet" - config_name: "zh-GG" data_files: - split: train path: "data/zh-GG/*.parquet" - config_name: "zh-GL" data_files: - split: train path: "data/zh-GL/*.parquet" - config_name: "zh-GQ" data_files: - split: train path: "data/zh-GQ/*.parquet" - config_name: "zh-GR" data_files: - split: train path: "data/zh-GR/*.parquet" - config_name: "zh-HK" data_files: - split: train path: "data/zh-HK/*.parquet" - config_name: "zh-HN" data_files: - split: train path: "data/zh-HN/*.parquet" - config_name: "zh-HR" data_files: - split: train path: "data/zh-HR/*.parquet" - config_name: "zh-HU" data_files: - split: train path: "data/zh-HU/*.parquet" - config_name: "zh-ID" data_files: - split: train path: "data/zh-ID/*.parquet" - config_name: "zh-IE" 
data_files: - split: train path: "data/zh-IE/*.parquet" - config_name: "zh-IL" data_files: - split: train path: "data/zh-IL/*.parquet" - config_name: "zh-IM" data_files: - split: train path: "data/zh-IM/*.parquet" - config_name: "zh-IN" data_files: - split: train path: "data/zh-IN/*.parquet" - config_name: "zh-IO" data_files: - split: train path: "data/zh-IO/*.parquet" - config_name: "zh-IR" data_files: - split: train path: "data/zh-IR/*.parquet" - config_name: "zh-IS" data_files: - split: train path: "data/zh-IS/*.parquet" - config_name: "zh-IT" data_files: - split: train path: "data/zh-IT/*.parquet" - config_name: "zh-JP" data_files: - split: train path: "data/zh-JP/*.parquet" - config_name: "zh-KE" data_files: - split: train path: "data/zh-KE/*.parquet" - config_name: "zh-KP" data_files: - split: train path: "data/zh-KP/*.parquet" - config_name: "zh-KR" data_files: - split: train path: "data/zh-KR/*.parquet" - config_name: "zh-KZ" data_files: - split: train path: "data/zh-KZ/*.parquet" - config_name: "zh-LA" data_files: - split: train path: "data/zh-LA/*.parquet" - config_name: "zh-LC" data_files: - split: train path: "data/zh-LC/*.parquet" - config_name: "zh-LI" data_files: - split: train path: "data/zh-LI/*.parquet" - config_name: "zh-LT" data_files: - split: train path: "data/zh-LT/*.parquet" - config_name: "zh-LU" data_files: - split: train path: "data/zh-LU/*.parquet" - config_name: "zh-LY" data_files: - split: train path: "data/zh-LY/*.parquet" - config_name: "zh-MA" data_files: - split: train path: "data/zh-MA/*.parquet" - config_name: "zh-ME" data_files: - split: train path: "data/zh-ME/*.parquet" - config_name: "zh-MG" data_files: - split: train path: "data/zh-MG/*.parquet" - config_name: "zh-MK" data_files: - split: train path: "data/zh-MK/*.parquet" - config_name: "zh-ML" data_files: - split: train path: "data/zh-ML/*.parquet" - config_name: "zh-MM" data_files: - split: train path: "data/zh-MM/*.parquet" - config_name: "zh-MN" data_files: - split: 
train path: "data/zh-MN/*.parquet" - config_name: "zh-MO" data_files: - split: train path: "data/zh-MO/*.parquet" - config_name: "zh-MU" data_files: - split: train path: "data/zh-MU/*.parquet" - config_name: "zh-MX" data_files: - split: train path: "data/zh-MX/*.parquet" - config_name: "zh-MY" data_files: - split: train path: "data/zh-MY/*.parquet" - config_name: "zh-NG" data_files: - split: train path: "data/zh-NG/*.parquet" - config_name: "zh-NL" data_files: - split: train path: "data/zh-NL/*.parquet" - config_name: "zh-NO" data_files: - split: train path: "data/zh-NO/*.parquet" - config_name: "zh-NZ" data_files: - split: train path: "data/zh-NZ/*.parquet" - config_name: "zh-PE" data_files: - split: train path: "data/zh-PE/*.parquet" - config_name: "zh-PH" data_files: - split: train path: "data/zh-PH/*.parquet" - config_name: "zh-PL" data_files: - split: train path: "data/zh-PL/*.parquet" - config_name: "zh-PM" data_files: - split: train path: "data/zh-PM/*.parquet" - config_name: "zh-PT" data_files: - split: train path: "data/zh-PT/*.parquet" - config_name: "zh-PW" data_files: - split: train path: "data/zh-PW/*.parquet" - config_name: "zh-QA" data_files: - split: train path: "data/zh-QA/*.parquet" - config_name: "zh-RO" data_files: - split: train path: "data/zh-RO/*.parquet" - config_name: "zh-RU" data_files: - split: train path: "data/zh-RU/*.parquet" - config_name: "zh-SB" data_files: - split: train path: "data/zh-SB/*.parquet" - config_name: "zh-SC" data_files: - split: train path: "data/zh-SC/*.parquet" - config_name: "zh-SE" data_files: - split: train path: "data/zh-SE/*.parquet" - config_name: "zh-SG" data_files: - split: train path: "data/zh-SG/*.parquet" - config_name: "zh-SH" data_files: - split: train path: "data/zh-SH/*.parquet" - config_name: "zh-SK" data_files: - split: train path: "data/zh-SK/*.parquet" - config_name: "zh-SO" data_files: - split: train path: "data/zh-SO/*.parquet" - config_name: "zh-TC" data_files: - split: train path: 
"data/zh-TC/*.parquet" - config_name: "zh-TH" data_files: - split: train path: "data/zh-TH/*.parquet" - config_name: "zh-TK" data_files: - split: train path: "data/zh-TK/*.parquet" - config_name: "zh-TM" data_files: - split: train path: "data/zh-TM/*.parquet" - config_name: "zh-TO" data_files: - split: train path: "data/zh-TO/*.parquet" - config_name: "zh-TR" data_files: - split: train path: "data/zh-TR/*.parquet" - config_name: "zh-TT" data_files: - split: train path: "data/zh-TT/*.parquet" - config_name: "zh-TV" data_files: - split: train path: "data/zh-TV/*.parquet" - config_name: "zh-TW" data_files: - split: train path: "data/zh-TW/*.parquet" - config_name: "zh-UA" data_files: - split: train path: "data/zh-UA/*.parquet" - config_name: "zh-UK" data_files: - split: train path: "data/zh-UK/*.parquet" - config_name: "zh-US" data_files: - split: train path: "data/zh-US/*.parquet" - config_name: "zh-VA" data_files: - split: train path: "data/zh-VA/*.parquet" - config_name: "zh-VC" data_files: - split: train path: "data/zh-VC/*.parquet" - config_name: "zh-VG" data_files: - split: train path: "data/zh-VG/*.parquet" - config_name: "zh-VN" data_files: - split: train path: "data/zh-VN/*.parquet" - config_name: "zh-WS" data_files: - split: train path: "data/zh-WS/*.parquet" - config_name: "zh-XX" data_files: - split: train path: "data/zh-XX/*.parquet" - config_name: "zh-ZA" data_files: - split: train path: "data/zh-ZA/*.parquet" - config_name: "zh-ZH" data_files: - split: train path: "data/zh-ZH/*.parquet" - config_name: "zh-ZN" data_files: - split: train path: "data/zh-ZN/*.parquet" - config_name: "all" data_files: - split: train path: "data/**/*.parquet" default: true --- # Common Crawl Regional Corpus A multilingual web corpus with geographic region annotations, extracted from Common Crawl. Each document is labeled with a **locale** (language-region pair). 
## Quick Start

```python
from datasets import load_dataset

# Load a specific locale
ds = load_dataset("Yusser/CC-LARD", "es-MX")

# Load all data
ds = load_dataset("Yusser/CC-LARD", "all")

# Access the data
for row in ds["train"]:
    print(row["text"][:100], row["locale"], row["region_source"])
```

## Dataset Statistics

- **Total documents**: 13,150,911
- **Languages**: 140
- **Regions**: 349
- **Locales**: 4,334

## Columns

| Column | Type | Description |
|--------|------|-------------|
| `text` | string | Document text |
| `lang` | string | ISO 639 language code (two- or three-letter, e.g. `zh`, `als`) |
| `region` | string | Two-letter region code, nominally ISO 3166-1 alpha-2; values that are not assigned ISO codes (e.g. `XX`, `EN`, `ZH`) also occur |
| `locale` | string | Combined lang-region (e.g., es-MX) |
| `url` | string | Source URL |
| `lang_confidence` | float | fastText LID confidence |
| `region_source` | string | Signal used: html_lang, http_header, tld, none |
| `warc_date` | string | Crawl timestamp |
| `text_length` | int | Character count |
| `title` | string | Page title |
| `text_hash` | string | SHA-256 of normalized text |

## Available Locales

4,334 locales across 140 languages.

## License

Derived from Common Crawl data, released under CC-BY-4.0.
提供机构:
Yusser
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作