WikiQuality/frac_unique_words_lo
收藏Hugging Face2024-09-03 更新2025-04-26 收录
下载链接:
https://hf-mirror.com/datasets/WikiQuality/frac_unique_words_lo
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: am
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3997982.6528179804
num_examples: 2242
- name: test
num_bytes: 212203.36114421932
num_examples: 119
download_size: 4329922
dataset_size: 4210186.0139622
- config_name: ary
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3227822.7489270386
num_examples: 2207
- name: test
num_bytes: 171117.0193133047
num_examples: 117
download_size: 2135706
dataset_size: 3398939.7682403433
- config_name: bm
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 33462.12532299742
num_examples: 49
- name: test
num_bytes: 2048.701550387597
num_examples: 3
download_size: 147082
dataset_size: 35510.82687338501
- config_name: ee
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 64071.09171270718
num_examples: 73
- name: test
num_bytes: 3510.7447513812153
num_examples: 4
download_size: 230647
dataset_size: 67581.8364640884
- config_name: fon
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 104261.23980424144
num_examples: 122
- name: test
num_bytes: 5982.202283849919
num_examples: 7
download_size: 150008
dataset_size: 110243.44208809137
- config_name: ha
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 8611412.986106943
num_examples: 4015
- name: test
num_bytes: 454699.76414811256
num_examples: 212
download_size: 19054771
dataset_size: 9066112.750255056
- config_name: ig
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 9615704.837252276
num_examples: 3382
- name: test
num_bytes: 508932.9289970897
num_examples: 179
download_size: 15642651
dataset_size: 10124637.766249366
- config_name: lg
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 717121.1557772771
num_examples: 369
- name: test
num_bytes: 38868.35532668169
num_examples: 20
download_size: 1668875
dataset_size: 755989.5111039588
- config_name: ln
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 146079.03948605453
num_examples: 255
- name: test
num_bytes: 8020.0256972735815
num_examples: 14
download_size: 477693
dataset_size: 154099.06518332812
- config_name: ny
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 166606.30784913353
num_examples: 107
- name: test
num_bytes: 9342.409785932721
num_examples: 6
download_size: 433659
dataset_size: 175948.71763506625
- config_name: om
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 226259.67591564928
num_examples: 124
- name: test
num_bytes: 12772.723640399556
num_examples: 7
download_size: 882315
dataset_size: 239032.39955604883
- config_name: pcm
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 353722.9058219178
num_examples: 237
- name: test
num_bytes: 19402.522260273974
num_examples: 13
download_size: 533133
dataset_size: 373125.4280821918
- config_name: rn
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 36504.82848837209
num_examples: 53
- name: test
num_bytes: 2066.311046511628
num_examples: 3
download_size: 157440
dataset_size: 38571.13953488372
- config_name: rw
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1628210.0847390092
num_examples: 1152
- name: test
num_bytes: 86215.985389826
num_examples: 61
download_size: 2935136
dataset_size: 1714426.0701288353
- config_name: sn
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2347848.509154156
num_examples: 2826
- name: test
num_bytes: 123789.60646283413
num_examples: 149
download_size: 2488054
dataset_size: 2471638.11561699
- config_name: so
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1139012.6160427807
num_examples: 793
- name: test
num_bytes: 60326.014973262034
num_examples: 42
download_size: 3391604
dataset_size: 1199338.6310160428
- config_name: sw
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 6466601.708346557
num_examples: 6251
- name: test
num_bytes: 341381.949088844
num_examples: 330
download_size: 16995680
dataset_size: 6807983.6574354
- config_name: ti
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 16842.75204359673
num_examples: 9
- name: test
num_bytes: 1871.41689373297
num_examples: 1
download_size: 157682
dataset_size: 18714.1689373297
- config_name: tn
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 305254.10282258067
num_examples: 165
- name: test
num_bytes: 16650.22379032258
num_examples: 9
download_size: 740731
dataset_size: 321904.32661290327
- config_name: ts
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 104568.58865248227
num_examples: 95
- name: test
num_bytes: 6604.331914893617
num_examples: 6
download_size: 231043
dataset_size: 111172.92056737588
- config_name: tw
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1097216.9911976526
num_examples: 572
- name: test
num_bytes: 59464.5572152574
num_examples: 31
download_size: 1843798
dataset_size: 1156681.54841291
- config_name: wo
features:
- name: url
dtype: string
- name: id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 62594.62924120913
num_examples: 34
- name: test
num_bytes: 3682.0370141887724
num_examples: 2
download_size: 850455
dataset_size: 66276.6662553979
- config_name: yo
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1038744.7297423887
num_examples: 949
- name: test
num_bytes: 54728.38407494145
num_examples: 50
download_size: 3462544
dataset_size: 1093473.1138173302
- config_name: zu
features:
- name: id
dtype: string
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 344477.3090909091
num_examples: 520
- name: test
num_bytes: 18548.778181818183
num_examples: 28
download_size: 1689857
dataset_size: 363026.0872727273
configs:
- config_name: am
data_files:
- split: train
path: am/train-*
- split: test
path: am/test-*
- config_name: ary
data_files:
- split: train
path: ary/train-*
- split: test
path: ary/test-*
- config_name: bm
data_files:
- split: train
path: bm/train-*
- split: test
path: bm/test-*
- config_name: ee
data_files:
- split: train
path: ee/train-*
- split: test
path: ee/test-*
- config_name: fon
data_files:
- split: train
path: fon/train-*
- split: test
path: fon/test-*
- config_name: ha
data_files:
- split: train
path: ha/train-*
- split: test
path: ha/test-*
- config_name: ig
data_files:
- split: train
path: ig/train-*
- split: test
path: ig/test-*
- config_name: lg
data_files:
- split: train
path: lg/train-*
- split: test
path: lg/test-*
- config_name: ln
data_files:
- split: train
path: ln/train-*
- split: test
path: ln/test-*
- config_name: ny
data_files:
- split: train
path: ny/train-*
- split: test
path: ny/test-*
- config_name: om
data_files:
- split: train
path: om/train-*
- split: test
path: om/test-*
- config_name: pcm
data_files:
- split: train
path: pcm/train-*
- split: test
path: pcm/test-*
- config_name: rn
data_files:
- split: train
path: rn/train-*
- split: test
path: rn/test-*
- config_name: rw
data_files:
- split: train
path: rw/train-*
- split: test
path: rw/test-*
- config_name: sn
data_files:
- split: train
path: sn/train-*
- split: test
path: sn/test-*
- config_name: so
data_files:
- split: train
path: so/train-*
- split: test
path: so/test-*
- config_name: sw
data_files:
- split: train
path: sw/train-*
- split: test
path: sw/test-*
- config_name: ti
data_files:
- split: train
path: ti/train-*
- split: test
path: ti/test-*
- config_name: tn
data_files:
- split: train
path: tn/train-*
- split: test
path: tn/test-*
- config_name: ts
data_files:
- split: train
path: ts/train-*
- split: test
path: ts/test-*
- config_name: tw
data_files:
- split: train
path: tw/train-*
- split: test
path: tw/test-*
- config_name: wo
data_files:
- split: train
path: wo/train-*
- split: test
path: wo/test-*
- config_name: yo
data_files:
- split: train
path: yo/train-*
- split: test
path: yo/test-*
- config_name: zu
data_files:
- split: train
path: zu/train-*
- split: test
path: zu/test-*
---
提供机构:
WikiQuality



