five

loicmagne/tatoeba-bitext-mining

收藏
Hugging Face2024-04-25 更新2024-06-12 收录
下载链接:
https://hf-mirror.com/datasets/loicmagne/tatoeba-bitext-mining
下载链接
链接失效反馈
官方服务:
资源简介:
--- language: - eng - sqi - fry - kur - tur - deu - nld - ron - ang - ido - jav - isl - slv - cym - kaz - est - heb - gla - mar - lat - bel - pms - gle - pes - nob - bul - cbk - hun - uig - rus - spa - hye - tel - afr - mon - arz - hrv - nov - gsw - nds - ukr - uzb - lit - ina - lfn - zsm - ita - cmn - lvs - glg - ceb - bre - ben - swg - arq - kab - fra - por - tat - oci - pol - war - aze - vie - nno - cha - mhr - dan - ell - amh - pam - hsb - srp - epo - kzj - awa - fao - mal - ile - bos - cor - cat - eus - yue - swe - dtp - kat - jpn - csb - xho - orv - ind - tuk - max - swh - hin - dsb - ber - tam - slk - tgl - ast - mkd - khm - ces - tzl - urd - ara - kor - yid - fin - tha - wuu configs: - config_name: default data_files: - split: test path: "test/*" - config_name: sqi-eng data_files: - split: test path: "test/sqi-eng.jsonl.gz" - config_name: fry-eng data_files: - split: test path: "test/fry-eng.jsonl.gz" - config_name: kur-eng data_files: - split: test path: "test/kur-eng.jsonl.gz" - config_name: tur-eng data_files: - split: test path: "test/tur-eng.jsonl.gz" - config_name: deu-eng data_files: - split: test path: "test/deu-eng.jsonl.gz" - config_name: nld-eng data_files: - split: test path: "test/nld-eng.jsonl.gz" - config_name: ron-eng data_files: - split: test path: "test/ron-eng.jsonl.gz" - config_name: ang-eng data_files: - split: test path: "test/ang-eng.jsonl.gz" - config_name: ido-eng data_files: - split: test path: "test/ido-eng.jsonl.gz" - config_name: jav-eng data_files: - split: test path: "test/jav-eng.jsonl.gz" - config_name: isl-eng data_files: - split: test path: "test/isl-eng.jsonl.gz" - config_name: slv-eng data_files: - split: test path: "test/slv-eng.jsonl.gz" - config_name: cym-eng data_files: - split: test path: "test/cym-eng.jsonl.gz" - config_name: kaz-eng data_files: - split: test path: "test/kaz-eng.jsonl.gz" - config_name: est-eng data_files: - split: test path: "test/est-eng.jsonl.gz" - config_name: heb-eng data_files: - split: test 
path: "test/heb-eng.jsonl.gz" - config_name: gla-eng data_files: - split: test path: "test/gla-eng.jsonl.gz" - config_name: mar-eng data_files: - split: test path: "test/mar-eng.jsonl.gz" - config_name: lat-eng data_files: - split: test path: "test/lat-eng.jsonl.gz" - config_name: bel-eng data_files: - split: test path: "test/bel-eng.jsonl.gz" - config_name: pms-eng data_files: - split: test path: "test/pms-eng.jsonl.gz" - config_name: gle-eng data_files: - split: test path: "test/gle-eng.jsonl.gz" - config_name: pes-eng data_files: - split: test path: "test/pes-eng.jsonl.gz" - config_name: nob-eng data_files: - split: test path: "test/nob-eng.jsonl.gz" - config_name: bul-eng data_files: - split: test path: "test/bul-eng.jsonl.gz" - config_name: cbk-eng data_files: - split: test path: "test/cbk-eng.jsonl.gz" - config_name: hun-eng data_files: - split: test path: "test/hun-eng.jsonl.gz" - config_name: uig-eng data_files: - split: test path: "test/uig-eng.jsonl.gz" - config_name: rus-eng data_files: - split: test path: "test/rus-eng.jsonl.gz" - config_name: spa-eng data_files: - split: test path: "test/spa-eng.jsonl.gz" - config_name: hye-eng data_files: - split: test path: "test/hye-eng.jsonl.gz" - config_name: tel-eng data_files: - split: test path: "test/tel-eng.jsonl.gz" - config_name: afr-eng data_files: - split: test path: "test/afr-eng.jsonl.gz" - config_name: mon-eng data_files: - split: test path: "test/mon-eng.jsonl.gz" - config_name: arz-eng data_files: - split: test path: "test/arz-eng.jsonl.gz" - config_name: hrv-eng data_files: - split: test path: "test/hrv-eng.jsonl.gz" - config_name: nov-eng data_files: - split: test path: "test/nov-eng.jsonl.gz" - config_name: gsw-eng data_files: - split: test path: "test/gsw-eng.jsonl.gz" - config_name: nds-eng data_files: - split: test path: "test/nds-eng.jsonl.gz" - config_name: ukr-eng data_files: - split: test path: "test/ukr-eng.jsonl.gz" - config_name: uzb-eng data_files: - split: test path: 
"test/uzb-eng.jsonl.gz" - config_name: lit-eng data_files: - split: test path: "test/lit-eng.jsonl.gz" - config_name: ina-eng data_files: - split: test path: "test/ina-eng.jsonl.gz" - config_name: lfn-eng data_files: - split: test path: "test/lfn-eng.jsonl.gz" - config_name: zsm-eng data_files: - split: test path: "test/zsm-eng.jsonl.gz" - config_name: ita-eng data_files: - split: test path: "test/ita-eng.jsonl.gz" - config_name: cmn-eng data_files: - split: test path: "test/cmn-eng.jsonl.gz" - config_name: lvs-eng data_files: - split: test path: "test/lvs-eng.jsonl.gz" - config_name: glg-eng data_files: - split: test path: "test/glg-eng.jsonl.gz" - config_name: ceb-eng data_files: - split: test path: "test/ceb-eng.jsonl.gz" - config_name: bre-eng data_files: - split: test path: "test/bre-eng.jsonl.gz" - config_name: ben-eng data_files: - split: test path: "test/ben-eng.jsonl.gz" - config_name: swg-eng data_files: - split: test path: "test/swg-eng.jsonl.gz" - config_name: arq-eng data_files: - split: test path: "test/arq-eng.jsonl.gz" - config_name: kab-eng data_files: - split: test path: "test/kab-eng.jsonl.gz" - config_name: fra-eng data_files: - split: test path: "test/fra-eng.jsonl.gz" - config_name: por-eng data_files: - split: test path: "test/por-eng.jsonl.gz" - config_name: tat-eng data_files: - split: test path: "test/tat-eng.jsonl.gz" - config_name: oci-eng data_files: - split: test path: "test/oci-eng.jsonl.gz" - config_name: pol-eng data_files: - split: test path: "test/pol-eng.jsonl.gz" - config_name: war-eng data_files: - split: test path: "test/war-eng.jsonl.gz" - config_name: aze-eng data_files: - split: test path: "test/aze-eng.jsonl.gz" - config_name: vie-eng data_files: - split: test path: "test/vie-eng.jsonl.gz" - config_name: nno-eng data_files: - split: test path: "test/nno-eng.jsonl.gz" - config_name: cha-eng data_files: - split: test path: "test/cha-eng.jsonl.gz" - config_name: mhr-eng data_files: - split: test path: "test/mhr-eng.jsonl.gz" 
- config_name: dan-eng data_files: - split: test path: "test/dan-eng.jsonl.gz" - config_name: ell-eng data_files: - split: test path: "test/ell-eng.jsonl.gz" - config_name: amh-eng data_files: - split: test path: "test/amh-eng.jsonl.gz" - config_name: pam-eng data_files: - split: test path: "test/pam-eng.jsonl.gz" - config_name: hsb-eng data_files: - split: test path: "test/hsb-eng.jsonl.gz" - config_name: srp-eng data_files: - split: test path: "test/srp-eng.jsonl.gz" - config_name: epo-eng data_files: - split: test path: "test/epo-eng.jsonl.gz" - config_name: kzj-eng data_files: - split: test path: "test/kzj-eng.jsonl.gz" - config_name: awa-eng data_files: - split: test path: "test/awa-eng.jsonl.gz" - config_name: fao-eng data_files: - split: test path: "test/fao-eng.jsonl.gz" - config_name: mal-eng data_files: - split: test path: "test/mal-eng.jsonl.gz" - config_name: ile-eng data_files: - split: test path: "test/ile-eng.jsonl.gz" - config_name: bos-eng data_files: - split: test path: "test/bos-eng.jsonl.gz" - config_name: cor-eng data_files: - split: test path: "test/cor-eng.jsonl.gz" - config_name: cat-eng data_files: - split: test path: "test/cat-eng.jsonl.gz" - config_name: eus-eng data_files: - split: test path: "test/eus-eng.jsonl.gz" - config_name: yue-eng data_files: - split: test path: "test/yue-eng.jsonl.gz" - config_name: swe-eng data_files: - split: test path: "test/swe-eng.jsonl.gz" - config_name: dtp-eng data_files: - split: test path: "test/dtp-eng.jsonl.gz" - config_name: kat-eng data_files: - split: test path: "test/kat-eng.jsonl.gz" - config_name: jpn-eng data_files: - split: test path: "test/jpn-eng.jsonl.gz" - config_name: csb-eng data_files: - split: test path: "test/csb-eng.jsonl.gz" - config_name: xho-eng data_files: - split: test path: "test/xho-eng.jsonl.gz" - config_name: orv-eng data_files: - split: test path: "test/orv-eng.jsonl.gz" - config_name: ind-eng data_files: - split: test path: "test/ind-eng.jsonl.gz" - config_name: tuk-eng 
data_files: - split: test path: "test/tuk-eng.jsonl.gz" - config_name: max-eng data_files: - split: test path: "test/max-eng.jsonl.gz" - config_name: swh-eng data_files: - split: test path: "test/swh-eng.jsonl.gz" - config_name: hin-eng data_files: - split: test path: "test/hin-eng.jsonl.gz" - config_name: dsb-eng data_files: - split: test path: "test/dsb-eng.jsonl.gz" - config_name: ber-eng data_files: - split: test path: "test/ber-eng.jsonl.gz" - config_name: tam-eng data_files: - split: test path: "test/tam-eng.jsonl.gz" - config_name: slk-eng data_files: - split: test path: "test/slk-eng.jsonl.gz" - config_name: tgl-eng data_files: - split: test path: "test/tgl-eng.jsonl.gz" - config_name: ast-eng data_files: - split: test path: "test/ast-eng.jsonl.gz" - config_name: mkd-eng data_files: - split: test path: "test/mkd-eng.jsonl.gz" - config_name: khm-eng data_files: - split: test path: "test/khm-eng.jsonl.gz" - config_name: ces-eng data_files: - split: test path: "test/ces-eng.jsonl.gz" - config_name: tzl-eng data_files: - split: test path: "test/tzl-eng.jsonl.gz" - config_name: urd-eng data_files: - split: test path: "test/urd-eng.jsonl.gz" - config_name: ara-eng data_files: - split: test path: "test/ara-eng.jsonl.gz" - config_name: kor-eng data_files: - split: test path: "test/kor-eng.jsonl.gz" - config_name: yid-eng data_files: - split: test path: "test/yid-eng.jsonl.gz" - config_name: fin-eng data_files: - split: test path: "test/fin-eng.jsonl.gz" - config_name: tha-eng data_files: - split: test path: "test/tha-eng.jsonl.gz" - config_name: wuu-eng data_files: - split: test path: "test/wuu-eng.jsonl.gz" ---
提供机构:
loicmagne
原始信息汇总

数据集概述

语言支持

数据集支持多种语言,包括但不限于:

  • 英语(eng)
  • 阿尔巴尼亚语(sqi)
  • 弗里斯兰语(fry)
  • 库尔德语(kur)
  • 土耳其语(tur)
  • 德语(deu)
  • 荷兰语(nld)
  • 罗马尼亚语(ron)
  • 古英语(ang)
  • 伊多语(ido)
  • 爪哇语(jav)
  • 冰岛语(isl)
  • 斯洛文尼亚语(slv)
  • 威尔士语(cym)
  • 哈萨克语(kaz)
  • 爱沙尼亚语(est)
  • 希伯来语(heb)
  • 苏格兰盖尔语(gla)
  • 马拉地语(mar)
  • 拉丁语(lat)
  • 白俄罗斯语(bel)
  • 皮埃蒙特语(pms)
  • 爱尔兰语(gle)
  • 波斯语(pes)
  • 书面挪威语(nob)
  • 保加利亚语(bul)
  • 查瓦卡诺语(cbk)
  • 匈牙利语(hun)
  • 维吾尔语(uig)
  • 俄语(rus)
  • 西班牙语(spa)
  • 亚美尼亚语(hye)
  • 泰卢固语(tel)
  • 阿非利卡语(afr)
  • 蒙古语(mon)
  • 埃及阿拉伯语(arz)
  • 克罗地亚语(hrv)
  • 诺维亚尔语(nov)
  • 瑞士德语(gsw)
  • 低地德语(nds)
  • 乌克兰语(ukr)
  • 乌兹别克语(uzb)
  • 立陶宛语(lit)
  • 因特语(ina)
  • 新共同语(lfn)
  • 标准马来语(zsm)
  • 意大利语(ita)
  • 官话(cmn)
  • 拉脱维亚语(lvs)
  • 加利西亚语(glg)
  • 宿务语(ceb)
  • 布列塔尼语(bre)
  • 孟加拉语(ben)
  • 施瓦本语(swg)
  • 阿尔及利亚阿拉伯语(arq)
  • 卡拜尔语(kab)
  • 法语(fra)
  • 葡萄牙语(por)
  • 塔塔尔语(tat)
  • 奥克西唐语(oci)
  • 波兰语(pol)
  • 瓦瑞语(war)
  • 阿塞拜疆语(aze)
  • 越南语(vie)
  • 新挪威语(nno)
  • 查莫罗语(cha)
  • 马里语(mhr)
  • 丹麦语(dan)
  • 希腊语(ell)
  • 阿姆哈拉语(amh)
  • 邦板牙语(pam)
  • 上索布语(hsb)
  • 塞尔维亚语(srp)
  • 世界语(epo)
  • 沿海卡达山语(kzj)
  • 阿瓦德语(awa)
  • 法罗语(fao)
  • 马拉雅拉姆语(mal)
  • 西方国际语(ile)
  • 波斯尼亚语(bos)
  • 康沃尔语(cor)
  • 加泰罗尼亚语(cat)
  • 巴斯克语(eus)
  • 粤语(yue)
  • 瑞典语(swe)
  • 中部杜顺语(dtp)
  • 格鲁吉亚语(kat)
  • 日语(jpn)
  • 卡舒比语(csb)
  • 科萨语(xho)
  • 古东斯拉夫语(orv)
  • 印度尼西亚语(ind)
  • 土库曼语(tuk)
  • 北马鲁古马来语(max)
  • 斯瓦希里语(swh)
  • 印地语(hin)
  • 下索布语(dsb)
  • 柏柏尔语(ber)
  • 泰米尔语(tam)
  • 斯洛伐克语(slk)
  • 他加禄语(tgl)
  • 阿斯图里亚斯语(ast)
  • 马其顿语(mkd)
  • 高棉语(khm)
  • 捷克语(ces)
  • 塔洛桑语(tzl)
  • 乌尔都语(urd)
  • 阿拉伯语(ara)
  • 韩语(kor)
  • 意第绪语(yid)
  • 芬兰语(fin)
  • 泰语(tha)
  • 吴语(wuu)

数据配置

数据集包含多个配置,每个配置对应不同的语言组合,所有配置均包含测试数据。以下是部分配置示例:

  • config_name: default
    • split: test
    • path: "test/*"
  • config_name: sqi-eng
    • split: test
    • path: "test/sqi-eng.jsonl.gz"
  • config_name: fry-eng
    • split: test
    • path: "test/fry-eng.jsonl.gz"
  • config_name: kur-eng
    • split: test
    • path: "test/kur-eng.jsonl.gz"
  • config_name: tur-eng
    • split: test
    • path: "test/tur-eng.jsonl.gz"
  • config_name: deu-eng
    • split: test
    • path: "test/deu-eng.jsonl.gz"
  • config_name: nld-eng
    • split: test
    • path: "test/nld-eng.jsonl.gz"
  • config_name: ron-eng
    • split: test
    • path: "test/ron-eng.jsonl.gz"
  • config_name: ang-eng
    • split: test
    • path: "test/ang-eng.jsonl.gz"
  • config_name: ido-eng
    • split: test
    • path: "test/ido-eng.jsonl.gz"
  • config_name: jav-eng
    • split: test
    • path: "test/jav-eng.jsonl.gz"
  • config_name: isl-eng
    • split: test
    • path: "test/isl-eng.jsonl.gz"
  • config_name: slv-eng
    • split: test
    • path: "test/slv-eng.jsonl.gz"
  • config_name: cym-eng
    • split: test
    • path: "test/cym-eng.jsonl.gz"
  • config_name: kaz-eng
    • split: test
    • path: "test/kaz-eng.jsonl.gz"
  • config_name: est-eng
    • split: test
    • path: "test/est-eng.jsonl.gz"
  • config_name: heb-eng
    • split: test
    • path: "test/heb-eng.jsonl.gz"
  • config_name: gla-eng
    • split: test
    • path: "test/gla-eng.jsonl.gz"
  • config_name: mar-eng
    • split: test
    • path: "test/mar-eng.jsonl.gz"
  • config_name: lat-eng
    • split: test
    • path: "test/lat-eng.jsonl.gz"
  • config_name: bel-eng
    • split: test
    • path: "test/bel-eng.jsonl.gz"
  • config_name: pms-eng
    • split: test
    • path: "test/pms-eng.jsonl.gz"
  • config_name: gle-eng
    • split: test
    • path: "test/gle-eng.jsonl.gz"
  • config_name: pes-eng
    • split: test
    • path: "test/pes-eng.jsonl.gz"
  • config_name: nob-eng
    • split: test
    • path: "test/nob-eng.jsonl.gz"
  • config_name: bul-eng
    • split: test
    • path: "test/bul-eng.jsonl.gz"
  • config_name: cbk-eng
    • split: test
    • path: "test/cbk-eng.jsonl.gz"
  • config_name: hun-eng
    • split: test
    • path: "test/hun-eng.jsonl.gz"
  • config_name: uig-eng
    • split: test
    • path: "test/uig-eng.jsonl.gz"
  • config_name: rus-eng
    • split: test
    • path: "test/rus-eng.jsonl.gz"
  • config_name: spa-eng
    • split: test
    • path: "test/spa-eng.jsonl.gz"
  • config_name: hye-eng
    • split: test
    • path: "test/hye-eng.jsonl.gz"
  • config_name: tel-eng
    • split: test
    • path: "test/tel-eng.jsonl.gz"
  • config_name: afr-eng
    • split: test
    • path: "test/afr-eng.jsonl.gz"
  • config_name: mon-eng
    • split: test
    • path: "test/mon-eng.jsonl.gz"
  • config_name: arz-eng
    • split: test
    • path: "test/arz-eng.jsonl.gz"
  • config_name: hrv-eng
    • split: test
    • path: "test/hrv-eng.jsonl.gz"
  • config_name: nov-eng
    • split: test
    • path: "test/nov-eng.jsonl.gz"
  • config_name: gsw-eng
    • split: test
    • path: "test/gsw-eng.jsonl.gz"
  • config_name: nds-eng
    • split: test
    • path: "test/nds-eng.jsonl.gz"
  • config_name: ukr-eng
    • split: test
    • path: "test/ukr-eng.jsonl.gz"
  • config_name: uzb-eng
    • split: test
    • path: "test/uzb-eng.jsonl.gz"
  • config_name: lit-eng
    • split: test
    • path: "test/lit-eng.jsonl.gz"
  • config_name: ina-eng
    • split: test
    • path: "test/ina-eng.jsonl.gz"
  • config_name: lfn-eng
    • split: test
    • path: "test/lfn-eng.jsonl.gz"
  • config_name: zsm-eng
    • split: test
    • path: "test/zsm-eng.jsonl.gz"
  • config_name: ita-eng
    • split: test
    • path: "test/ita-eng.jsonl.gz"
  • config_name: cmn-eng
    • split: test
    • path: "test/cmn-eng.jsonl.gz"
  • config_name: lvs-eng
    • split: test
    • path: "test/lvs-eng.jsonl.gz"
  • config_name: glg-eng
    • split: test
    • path: "test/glg-eng.jsonl.gz"
  • config_name: ceb-eng
    • split: test
    • path: "test/ceb-eng.jsonl.gz"
  • config_name: bre-eng
    • split: test
    • path: "test/bre-eng.jsonl.gz"
  • config_name: ben-eng
    • split: test
    • path: "test/ben-eng.jsonl.gz"
  • config_name: swg-eng
    • split: test
    • path: "test/swg-eng.jsonl.gz"
  • config_name: arq-eng
    • split: test
    • path: "test/arq-eng.jsonl.gz"
  • config_name: kab-eng
    • split: test
    • path: "test/kab-eng.jsonl.gz"
  • config_name: fra-eng
    • split: test
    • path: "test/fra-eng.jsonl.gz"
  • config_name: por-eng
    • split: test
    • path: "test/por-eng.jsonl.gz"
  • config_name: tat-eng
    • split: test
    • path: "test/tat-eng.jsonl.gz"
  • config_name: oci-eng
    • split: test
    • path: "test/oci-eng.jsonl.gz"
  • config_name: pol-eng
    • split: test
    • path: "test/pol-eng.jsonl.gz"
  • config_name: war-eng
    • split: test
    • path: "test/war-eng.jsonl.gz"
  • config_name: aze-eng
    • split: test
    • path: "test/aze-eng.jsonl.gz"
  • config_name: vie-eng
    • split: test
    • path: "test/vie-eng.jsonl.gz"
  • config_name: nno-eng
    • split: test
    • path: "test/nno-eng.jsonl.gz"
  • config_name: cha-eng
    • split: test
    • path: "test/cha-eng.jsonl.gz"
  • config_name: mhr-eng
    • split: test
    • path: "test/mhr-eng.jsonl.gz"
  • config_name: dan-eng
    • split: test
    • path: "test/dan-eng.jsonl.gz"
  • config_name: ell-eng
    • split: test
    • path: "test/ell-eng.jsonl.gz"
  • config_name: amh-eng
    • split: test
    • path: "test/amh-eng.jsonl.gz"
  • config_name: pam-eng
    • split: test
    • path: "test/pam-eng.jsonl.gz"
  • config_name: hsb-eng
    • split: test
    • path: "test/hsb-eng.jsonl.gz"
  • config_name: srp-eng
    • split: test
    • path: "test/srp-eng.jsonl.gz"
  • config_name: epo-eng
    • split: test
    • path: "test/epo-eng.jsonl.gz"
  • config_name: kzj-eng
    • split: test
    • path: "test/kzj-eng.jsonl.gz"
  • config_name: awa-eng
    • split: test
    • path: "test/awa-eng.jsonl.gz"
  • config_name: fao-eng
    • split: test
    • path: "test/fao-eng.jsonl.gz"
  • config_name: mal-eng
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作