loicmagne/tatoeba-bitext-mining
收藏Hugging Face2024-04-25 更新2024-06-12 收录
下载链接:
https://hf-mirror.com/datasets/loicmagne/tatoeba-bitext-mining
下载链接
链接失效反馈官方服务:
资源简介:
---
language:
- eng
- sqi
- fry
- kur
- tur
- deu
- nld
- ron
- ang
- ido
- jav
- isl
- slv
- cym
- kaz
- est
- heb
- gla
- mar
- lat
- bel
- pms
- gle
- pes
- nob
- bul
- cbk
- hun
- uig
- rus
- spa
- hye
- tel
- afr
- mon
- arz
- hrv
- nov
- gsw
- nds
- ukr
- uzb
- lit
- ina
- lfn
- zsm
- ita
- cmn
- lvs
- glg
- ceb
- bre
- ben
- swg
- arq
- kab
- fra
- por
- tat
- oci
- pol
- war
- aze
- vie
- nno
- cha
- mhr
- dan
- ell
- amh
- pam
- hsb
- srp
- epo
- kzj
- awa
- fao
- mal
- ile
- bos
- cor
- cat
- eus
- yue
- swe
- dtp
- kat
- jpn
- csb
- xho
- orv
- ind
- tuk
- max
- swh
- hin
- dsb
- ber
- tam
- slk
- tgl
- ast
- mkd
- khm
- ces
- tzl
- urd
- ara
- kor
- yid
- fin
- tha
- wuu
configs:
- config_name: default
data_files:
- split: test
path: "test/*"
- config_name: sqi-eng
data_files:
- split: test
path: "test/sqi-eng.jsonl.gz"
- config_name: fry-eng
data_files:
- split: test
path: "test/fry-eng.jsonl.gz"
- config_name: kur-eng
data_files:
- split: test
path: "test/kur-eng.jsonl.gz"
- config_name: tur-eng
data_files:
- split: test
path: "test/tur-eng.jsonl.gz"
- config_name: deu-eng
data_files:
- split: test
path: "test/deu-eng.jsonl.gz"
- config_name: nld-eng
data_files:
- split: test
path: "test/nld-eng.jsonl.gz"
- config_name: ron-eng
data_files:
- split: test
path: "test/ron-eng.jsonl.gz"
- config_name: ang-eng
data_files:
- split: test
path: "test/ang-eng.jsonl.gz"
- config_name: ido-eng
data_files:
- split: test
path: "test/ido-eng.jsonl.gz"
- config_name: jav-eng
data_files:
- split: test
path: "test/jav-eng.jsonl.gz"
- config_name: isl-eng
data_files:
- split: test
path: "test/isl-eng.jsonl.gz"
- config_name: slv-eng
data_files:
- split: test
path: "test/slv-eng.jsonl.gz"
- config_name: cym-eng
data_files:
- split: test
path: "test/cym-eng.jsonl.gz"
- config_name: kaz-eng
data_files:
- split: test
path: "test/kaz-eng.jsonl.gz"
- config_name: est-eng
data_files:
- split: test
path: "test/est-eng.jsonl.gz"
- config_name: heb-eng
data_files:
- split: test
path: "test/heb-eng.jsonl.gz"
- config_name: gla-eng
data_files:
- split: test
path: "test/gla-eng.jsonl.gz"
- config_name: mar-eng
data_files:
- split: test
path: "test/mar-eng.jsonl.gz"
- config_name: lat-eng
data_files:
- split: test
path: "test/lat-eng.jsonl.gz"
- config_name: bel-eng
data_files:
- split: test
path: "test/bel-eng.jsonl.gz"
- config_name: pms-eng
data_files:
- split: test
path: "test/pms-eng.jsonl.gz"
- config_name: gle-eng
data_files:
- split: test
path: "test/gle-eng.jsonl.gz"
- config_name: pes-eng
data_files:
- split: test
path: "test/pes-eng.jsonl.gz"
- config_name: nob-eng
data_files:
- split: test
path: "test/nob-eng.jsonl.gz"
- config_name: bul-eng
data_files:
- split: test
path: "test/bul-eng.jsonl.gz"
- config_name: cbk-eng
data_files:
- split: test
path: "test/cbk-eng.jsonl.gz"
- config_name: hun-eng
data_files:
- split: test
path: "test/hun-eng.jsonl.gz"
- config_name: uig-eng
data_files:
- split: test
path: "test/uig-eng.jsonl.gz"
- config_name: rus-eng
data_files:
- split: test
path: "test/rus-eng.jsonl.gz"
- config_name: spa-eng
data_files:
- split: test
path: "test/spa-eng.jsonl.gz"
- config_name: hye-eng
data_files:
- split: test
path: "test/hye-eng.jsonl.gz"
- config_name: tel-eng
data_files:
- split: test
path: "test/tel-eng.jsonl.gz"
- config_name: afr-eng
data_files:
- split: test
path: "test/afr-eng.jsonl.gz"
- config_name: mon-eng
data_files:
- split: test
path: "test/mon-eng.jsonl.gz"
- config_name: arz-eng
data_files:
- split: test
path: "test/arz-eng.jsonl.gz"
- config_name: hrv-eng
data_files:
- split: test
path: "test/hrv-eng.jsonl.gz"
- config_name: nov-eng
data_files:
- split: test
path: "test/nov-eng.jsonl.gz"
- config_name: gsw-eng
data_files:
- split: test
path: "test/gsw-eng.jsonl.gz"
- config_name: nds-eng
data_files:
- split: test
path: "test/nds-eng.jsonl.gz"
- config_name: ukr-eng
data_files:
- split: test
path: "test/ukr-eng.jsonl.gz"
- config_name: uzb-eng
data_files:
- split: test
path: "test/uzb-eng.jsonl.gz"
- config_name: lit-eng
data_files:
- split: test
path: "test/lit-eng.jsonl.gz"
- config_name: ina-eng
data_files:
- split: test
path: "test/ina-eng.jsonl.gz"
- config_name: lfn-eng
data_files:
- split: test
path: "test/lfn-eng.jsonl.gz"
- config_name: zsm-eng
data_files:
- split: test
path: "test/zsm-eng.jsonl.gz"
- config_name: ita-eng
data_files:
- split: test
path: "test/ita-eng.jsonl.gz"
- config_name: cmn-eng
data_files:
- split: test
path: "test/cmn-eng.jsonl.gz"
- config_name: lvs-eng
data_files:
- split: test
path: "test/lvs-eng.jsonl.gz"
- config_name: glg-eng
data_files:
- split: test
path: "test/glg-eng.jsonl.gz"
- config_name: ceb-eng
data_files:
- split: test
path: "test/ceb-eng.jsonl.gz"
- config_name: bre-eng
data_files:
- split: test
path: "test/bre-eng.jsonl.gz"
- config_name: ben-eng
data_files:
- split: test
path: "test/ben-eng.jsonl.gz"
- config_name: swg-eng
data_files:
- split: test
path: "test/swg-eng.jsonl.gz"
- config_name: arq-eng
data_files:
- split: test
path: "test/arq-eng.jsonl.gz"
- config_name: kab-eng
data_files:
- split: test
path: "test/kab-eng.jsonl.gz"
- config_name: fra-eng
data_files:
- split: test
path: "test/fra-eng.jsonl.gz"
- config_name: por-eng
data_files:
- split: test
path: "test/por-eng.jsonl.gz"
- config_name: tat-eng
data_files:
- split: test
path: "test/tat-eng.jsonl.gz"
- config_name: oci-eng
data_files:
- split: test
path: "test/oci-eng.jsonl.gz"
- config_name: pol-eng
data_files:
- split: test
path: "test/pol-eng.jsonl.gz"
- config_name: war-eng
data_files:
- split: test
path: "test/war-eng.jsonl.gz"
- config_name: aze-eng
data_files:
- split: test
path: "test/aze-eng.jsonl.gz"
- config_name: vie-eng
data_files:
- split: test
path: "test/vie-eng.jsonl.gz"
- config_name: nno-eng
data_files:
- split: test
path: "test/nno-eng.jsonl.gz"
- config_name: cha-eng
data_files:
- split: test
path: "test/cha-eng.jsonl.gz"
- config_name: mhr-eng
data_files:
- split: test
path: "test/mhr-eng.jsonl.gz"
- config_name: dan-eng
data_files:
- split: test
path: "test/dan-eng.jsonl.gz"
- config_name: ell-eng
data_files:
- split: test
path: "test/ell-eng.jsonl.gz"
- config_name: amh-eng
data_files:
- split: test
path: "test/amh-eng.jsonl.gz"
- config_name: pam-eng
data_files:
- split: test
path: "test/pam-eng.jsonl.gz"
- config_name: hsb-eng
data_files:
- split: test
path: "test/hsb-eng.jsonl.gz"
- config_name: srp-eng
data_files:
- split: test
path: "test/srp-eng.jsonl.gz"
- config_name: epo-eng
data_files:
- split: test
path: "test/epo-eng.jsonl.gz"
- config_name: kzj-eng
data_files:
- split: test
path: "test/kzj-eng.jsonl.gz"
- config_name: awa-eng
data_files:
- split: test
path: "test/awa-eng.jsonl.gz"
- config_name: fao-eng
data_files:
- split: test
path: "test/fao-eng.jsonl.gz"
- config_name: mal-eng
data_files:
- split: test
path: "test/mal-eng.jsonl.gz"
- config_name: ile-eng
data_files:
- split: test
path: "test/ile-eng.jsonl.gz"
- config_name: bos-eng
data_files:
- split: test
path: "test/bos-eng.jsonl.gz"
- config_name: cor-eng
data_files:
- split: test
path: "test/cor-eng.jsonl.gz"
- config_name: cat-eng
data_files:
- split: test
path: "test/cat-eng.jsonl.gz"
- config_name: eus-eng
data_files:
- split: test
path: "test/eus-eng.jsonl.gz"
- config_name: yue-eng
data_files:
- split: test
path: "test/yue-eng.jsonl.gz"
- config_name: swe-eng
data_files:
- split: test
path: "test/swe-eng.jsonl.gz"
- config_name: dtp-eng
data_files:
- split: test
path: "test/dtp-eng.jsonl.gz"
- config_name: kat-eng
data_files:
- split: test
path: "test/kat-eng.jsonl.gz"
- config_name: jpn-eng
data_files:
- split: test
path: "test/jpn-eng.jsonl.gz"
- config_name: csb-eng
data_files:
- split: test
path: "test/csb-eng.jsonl.gz"
- config_name: xho-eng
data_files:
- split: test
path: "test/xho-eng.jsonl.gz"
- config_name: orv-eng
data_files:
- split: test
path: "test/orv-eng.jsonl.gz"
- config_name: ind-eng
data_files:
- split: test
path: "test/ind-eng.jsonl.gz"
- config_name: tuk-eng
data_files:
- split: test
path: "test/tuk-eng.jsonl.gz"
- config_name: max-eng
data_files:
- split: test
path: "test/max-eng.jsonl.gz"
- config_name: swh-eng
data_files:
- split: test
path: "test/swh-eng.jsonl.gz"
- config_name: hin-eng
data_files:
- split: test
path: "test/hin-eng.jsonl.gz"
- config_name: dsb-eng
data_files:
- split: test
path: "test/dsb-eng.jsonl.gz"
- config_name: ber-eng
data_files:
- split: test
path: "test/ber-eng.jsonl.gz"
- config_name: tam-eng
data_files:
- split: test
path: "test/tam-eng.jsonl.gz"
- config_name: slk-eng
data_files:
- split: test
path: "test/slk-eng.jsonl.gz"
- config_name: tgl-eng
data_files:
- split: test
path: "test/tgl-eng.jsonl.gz"
- config_name: ast-eng
data_files:
- split: test
path: "test/ast-eng.jsonl.gz"
- config_name: mkd-eng
data_files:
- split: test
path: "test/mkd-eng.jsonl.gz"
- config_name: khm-eng
data_files:
- split: test
path: "test/khm-eng.jsonl.gz"
- config_name: ces-eng
data_files:
- split: test
path: "test/ces-eng.jsonl.gz"
- config_name: tzl-eng
data_files:
- split: test
path: "test/tzl-eng.jsonl.gz"
- config_name: urd-eng
data_files:
- split: test
path: "test/urd-eng.jsonl.gz"
- config_name: ara-eng
data_files:
- split: test
path: "test/ara-eng.jsonl.gz"
- config_name: kor-eng
data_files:
- split: test
path: "test/kor-eng.jsonl.gz"
- config_name: yid-eng
data_files:
- split: test
path: "test/yid-eng.jsonl.gz"
- config_name: fin-eng
data_files:
- split: test
path: "test/fin-eng.jsonl.gz"
- config_name: tha-eng
data_files:
- split: test
path: "test/tha-eng.jsonl.gz"
- config_name: wuu-eng
data_files:
- split: test
path: "test/wuu-eng.jsonl.gz"
---
提供机构:
loicmagne
原始信息汇总
数据集概述
语言支持
数据集支持多种语言,包括但不限于:
- 英语(eng)
- 阿尔巴尼亚语(sqi)
- 弗里斯兰语(fry)
- 库尔德语(kur)
- 土耳其语(tur)
- 德语(deu)
- 荷兰语(nld)
- 罗马尼亚语(ron)
- 古英语(ang)
- 伊多语(ido)
- 爪哇语(jav)
- 冰岛语(isl)
- 斯洛文尼亚语(slv)
- 威尔士语(cym)
- 哈萨克语(kaz)
- 爱沙尼亚语(est)
- 希伯来语(heb)
- 苏格兰盖尔语(gla)
- 马拉地语(mar)
- 拉丁语(lat)
- 白俄罗斯语(bel)
- 皮埃蒙特语(pms)
- 爱尔兰语(gle)
- 波斯语(pes)
- 书面挪威语(nob)
- 保加利亚语(bul)
- 查瓦卡诺语(cbk)
- 匈牙利语(hun)
- 维吾尔语(uig)
- 俄语(rus)
- 西班牙语(spa)
- 亚美尼亚语(hye)
- 泰卢固语(tel)
- 阿非利卡语(afr)
- 蒙古语(mon)
- 埃及阿拉伯语(arz)
- 克罗地亚语(hrv)
- 诺维亚语(nov)
- 瑞士德语(gsw)
- 低地德语(nds)
- 乌克兰语(ukr)
- 乌兹别克语(uzb)
- 立陶宛语(lit)
- 因特语(ina)
- 新共同语言(lfn)
- 标准马来语(zsm)
- 意大利语(ita)
- 官话(cmn)
- 拉脱维亚语(lvs)
- 加利西亚语(glg)
- 宿务语(ceb)
- 布列塔尼语(bre)
- 孟加拉语(ben)
- 施瓦本语(swg)
- 阿尔及利亚阿拉伯语(arq)
- 卡拜尔语(kab)
- 法语(fra)
- 葡萄牙语(por)
- 塔塔尔语(tat)
- 奥克西唐语(oci)
- 波兰语(pol)
- 瓦瑞语(war)
- 阿塞拜疆语(aze)
- 越南语(vie)
- 新挪威语(nno)
- 查莫罗语(cha)
- 马里语(mhr)
- 丹麦语(dan)
- 希腊语(ell)
- 阿姆哈拉语(amh)
- 邦板牙语(pam)
- 上索布语(hsb)
- 塞尔维亚语(srp)
- 世界语(epo)
- 沿海卡达山语(kzj)
- 阿瓦德语(awa)
- 法罗语(fao)
- 马拉雅拉姆语(mal)
- 西方语(ile)
- 波斯尼亚语(bos)
- 康沃尔语(cor)
- 加泰罗尼亚语(cat)
- 巴斯克语(eus)
- 粤语(yue)
- 瑞典语(swe)
- 中部杜顺语(dtp)
- 格鲁吉亚语(kat)
- 日语(jpn)
- 卡舒比语(csb)
- 科萨语(xho)
- 古东斯拉夫语(orv)
- 印度尼西亚语(ind)
- 土库曼语(tuk)
- 北马鲁古马来语(max)
- 斯瓦希里语(swh)
- 印地语(hin)
- 下索布语(dsb)
- 柏柏尔语(ber)
- 泰米尔语(tam)
- 斯洛伐克语(slk)
- 他加禄语(tgl)
- 阿斯图里亚斯语(ast)
- 马其顿语(mkd)
- 高棉语(khm)
- 捷克语(ces)
- 塔洛萨语(tzl)
- 乌尔都语(urd)
- 阿拉伯语(ara)
- 韩语(kor)
- 意第绪语(yid)
- 芬兰语(fin)
- 泰语(tha)
- 吴语(wuu)
数据配置
数据集包含多个配置,每个配置对应不同的语言组合,所有配置均包含测试数据。以下是部分配置示例:
config_name: default, split: test, path: "test/*"
config_name: sqi-eng, split: test, path: "test/sqi-eng.jsonl.gz"
config_name: fry-eng, split: test, path: "test/fry-eng.jsonl.gz"
config_name: kur-eng, split: test, path: "test/kur-eng.jsonl.gz"
config_name: tur-eng, split: test, path: "test/tur-eng.jsonl.gz"
config_name: deu-eng, split: test, path: "test/deu-eng.jsonl.gz"
config_name: nld-eng, split: test, path: "test/nld-eng.jsonl.gz"
config_name: ron-eng, split: test, path: "test/ron-eng.jsonl.gz"
config_name: ang-eng, split: test, path: "test/ang-eng.jsonl.gz"
config_name: ido-eng, split: test, path: "test/ido-eng.jsonl.gz"
config_name: jav-eng, split: test, path: "test/jav-eng.jsonl.gz"
config_name: isl-eng, split: test, path: "test/isl-eng.jsonl.gz"
config_name: slv-eng, split: test, path: "test/slv-eng.jsonl.gz"
config_name: cym-eng, split: test, path: "test/cym-eng.jsonl.gz"
config_name: kaz-eng, split: test, path: "test/kaz-eng.jsonl.gz"
config_name: est-eng, split: test, path: "test/est-eng.jsonl.gz"
config_name: heb-eng, split: test, path: "test/heb-eng.jsonl.gz"
config_name: gla-eng, split: test, path: "test/gla-eng.jsonl.gz"
config_name: mar-eng, split: test, path: "test/mar-eng.jsonl.gz"
config_name: lat-eng, split: test, path: "test/lat-eng.jsonl.gz"
config_name: bel-eng, split: test, path: "test/bel-eng.jsonl.gz"
config_name: pms-eng, split: test, path: "test/pms-eng.jsonl.gz"
config_name: gle-eng, split: test, path: "test/gle-eng.jsonl.gz"
config_name: pes-eng, split: test, path: "test/pes-eng.jsonl.gz"
config_name: nob-eng, split: test, path: "test/nob-eng.jsonl.gz"
config_name: bul-eng, split: test, path: "test/bul-eng.jsonl.gz"
config_name: cbk-eng, split: test, path: "test/cbk-eng.jsonl.gz"
config_name: hun-eng, split: test, path: "test/hun-eng.jsonl.gz"
config_name: uig-eng, split: test, path: "test/uig-eng.jsonl.gz"
config_name: rus-eng, split: test, path: "test/rus-eng.jsonl.gz"
config_name: spa-eng, split: test, path: "test/spa-eng.jsonl.gz"
config_name: hye-eng, split: test, path: "test/hye-eng.jsonl.gz"
config_name: tel-eng, split: test, path: "test/tel-eng.jsonl.gz"
config_name: afr-eng, split: test, path: "test/afr-eng.jsonl.gz"
config_name: mon-eng, split: test, path: "test/mon-eng.jsonl.gz"
config_name: arz-eng, split: test, path: "test/arz-eng.jsonl.gz"
config_name: hrv-eng, split: test, path: "test/hrv-eng.jsonl.gz"
config_name: nov-eng, split: test, path: "test/nov-eng.jsonl.gz"
config_name: gsw-eng, split: test, path: "test/gsw-eng.jsonl.gz"
config_name: nds-eng, split: test, path: "test/nds-eng.jsonl.gz"
config_name: ukr-eng, split: test, path: "test/ukr-eng.jsonl.gz"
config_name: uzb-eng, split: test, path: "test/uzb-eng.jsonl.gz"
config_name: lit-eng, split: test, path: "test/lit-eng.jsonl.gz"
config_name: ina-eng, split: test, path: "test/ina-eng.jsonl.gz"
config_name: lfn-eng, split: test, path: "test/lfn-eng.jsonl.gz"
config_name: zsm-eng, split: test, path: "test/zsm-eng.jsonl.gz"
config_name: ita-eng, split: test, path: "test/ita-eng.jsonl.gz"
config_name: cmn-eng, split: test, path: "test/cmn-eng.jsonl.gz"
config_name: lvs-eng, split: test, path: "test/lvs-eng.jsonl.gz"
config_name: glg-eng, split: test, path: "test/glg-eng.jsonl.gz"
config_name: ceb-eng, split: test, path: "test/ceb-eng.jsonl.gz"
config_name: bre-eng, split: test, path: "test/bre-eng.jsonl.gz"
config_name: ben-eng, split: test, path: "test/ben-eng.jsonl.gz"
config_name: swg-eng, split: test, path: "test/swg-eng.jsonl.gz"
config_name: arq-eng, split: test, path: "test/arq-eng.jsonl.gz"
config_name: kab-eng, split: test, path: "test/kab-eng.jsonl.gz"
config_name: fra-eng, split: test, path: "test/fra-eng.jsonl.gz"
config_name: por-eng, split: test, path: "test/por-eng.jsonl.gz"
config_name: tat-eng, split: test, path: "test/tat-eng.jsonl.gz"
config_name: oci-eng, split: test, path: "test/oci-eng.jsonl.gz"
config_name: pol-eng, split: test, path: "test/pol-eng.jsonl.gz"
config_name: war-eng, split: test, path: "test/war-eng.jsonl.gz"
config_name: aze-eng, split: test, path: "test/aze-eng.jsonl.gz"
config_name: vie-eng, split: test, path: "test/vie-eng.jsonl.gz"
config_name: nno-eng, split: test, path: "test/nno-eng.jsonl.gz"
config_name: cha-eng, split: test, path: "test/cha-eng.jsonl.gz"
config_name: mhr-eng, split: test, path: "test/mhr-eng.jsonl.gz"
config_name: dan-eng, split: test, path: "test/dan-eng.jsonl.gz"
config_name: ell-eng, split: test, path: "test/ell-eng.jsonl.gz"
config_name: amh-eng, split: test, path: "test/amh-eng.jsonl.gz"
config_name: pam-eng, split: test, path: "test/pam-eng.jsonl.gz"
config_name: hsb-eng, split: test, path: "test/hsb-eng.jsonl.gz"
config_name: srp-eng, split: test, path: "test/srp-eng.jsonl.gz"
config_name: epo-eng, split: test, path: "test/epo-eng.jsonl.gz"
config_name: kzj-eng, split: test, path: "test/kzj-eng.jsonl.gz"
config_name: awa-eng, split: test, path: "test/awa-eng.jsonl.gz"
config_name: fao-eng, split: test, path: "test/fao-eng.jsonl.gz"
- `config_name: mal



