WikiQuality/frac_unique_words_hi
收藏 · Hugging Face · 2024-09-03 更新 · 2025-04-26 收录
下载链接:
https://hf-mirror.com/datasets/WikiQuality/frac_unique_words_hi
下载链接
链接失效反馈 · 官方服务:
资源简介:
---
dataset_info:
- config_name: am
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 15897419.870594244
    num_examples: 8915
  - name: test
    num_bytes: 838114.1154435553
    num_examples: 470
  download_size: 5504010
  dataset_size: 16735533.9860378
- config_name: ary
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 7129875.80472103
    num_examples: 4875
  - name: test
    num_bytes: 375872.4270386266
    num_examples: 257
  download_size: 2695572
  dataset_size: 7505748.231759657
- config_name: bm
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 467786.85400516796
    num_examples: 685
  - name: test
    num_bytes: 25267.319121447028
    num_examples: 37
  download_size: 175599
  dataset_size: 493054.173126615
- config_name: ee
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 689861.3436464089
    num_examples: 786
  - name: test
    num_bytes: 36862.81988950276
    num_examples: 42
  download_size: 257597
  dataset_size: 726724.1635359116
- config_name: fon
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 392261.5497553018
    num_examples: 459
  - name: test
    num_bytes: 21365.00815660685
    num_examples: 25
  download_size: 158712
  dataset_size: 413626.55791190866
- config_name: ha
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 59291133.39674728
    num_examples: 27644
  - name: test
    num_bytes: 3120698.8529976597
    num_examples: 1455
  download_size: 22501476
  dataset_size: 62411832.249744944
- config_name: ig
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 48849031.581558645
    num_examples: 17181
  - name: test
    num_bytes: 2573096.6521919896
    num_examples: 905
  download_size: 17785163
  dataset_size: 51422128.233750634
- config_name: lg
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 5017904.672674606
    num_examples: 2582
  - name: test
    num_bytes: 264304.8162214355
    num_examples: 136
  download_size: 1927000
  dataset_size: 5282209.488896041
- config_name: ln
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 1589683.6649952992
    num_examples: 2775
  - name: test
    num_bytes: 84210.26982137261
    num_examples: 147
  download_size: 640665
  dataset_size: 1673893.9348166718
- config_name: ny
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 1283024.2772680938
    num_examples: 824
  - name: test
    num_bytes: 68511.00509683996
    num_examples: 44
  download_size: 485831
  dataset_size: 1351535.2823649338
- config_name: om
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 2895758.9167591566
    num_examples: 1587
  - name: test
    num_bytes: 153272.68368479467
    num_examples: 84
  download_size: 1036332
  dataset_size: 3049031.6004439513
- config_name: pcm
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 1301461.493150685
    num_examples: 872
  - name: test
    num_bytes: 68655.07876712328
    num_examples: 46
  download_size: 590141
  dataset_size: 1370116.5719178081
- config_name: rn
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 413262.20930232556
    num_examples: 600
  - name: test
    num_bytes: 22040.6511627907
    num_examples: 32
  download_size: 166153
  dataset_size: 435302.8604651163
- config_name: rw
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 8480260.858015673
    num_examples: 6000
  - name: test
    num_bytes: 446627.0718554921
    num_examples: 316
  download_size: 3475929
  dataset_size: 8926887.929871166
- config_name: sn
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 6273391.398663493
    num_examples: 7551
  - name: test
    num_bytes: 330659.48571951664
    num_examples: 398
  download_size: 2527700
  dataset_size: 6604050.88438301
- config_name: so
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 10343038.90053476
    num_examples: 7201
  - name: test
    num_bytes: 544370.4684491978
    num_examples: 379
  download_size: 4061181
  dataset_size: 10887409.368983958
- config_name: sw
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 52369025.48098348
    num_examples: 50623
  - name: test
    num_bytes: 2756917.861581119
    num_examples: 2665
  download_size: 19439664
  dataset_size: 55125943.3425646
- config_name: ti
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 634410.3269754768
    num_examples: 339
  - name: test
    num_bytes: 33685.50408719346
    num_examples: 18
  download_size: 210172
  dataset_size: 668095.8310626703
- config_name: tn
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 2308831.0322580645
    num_examples: 1248
  - name: test
    num_bytes: 122101.64112903226
    num_examples: 66
  download_size: 837869
  dataset_size: 2430932.6733870967
- config_name: ts
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 630713.6978723404
    num_examples: 573
  - name: test
    num_bytes: 34122.38156028369
    num_examples: 31
  download_size: 245203
  dataset_size: 664836.0794326242
- config_name: tw
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 5731616.030941584
    num_examples: 2988
  - name: test
    num_bytes: 303077.4206455055
    num_examples: 158
  download_size: 2122988
  dataset_size: 6034693.451587089
- config_name: wo
  features:
  - name: url
    dtype: string
  - name: id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 2770732.853177051
    num_examples: 1505
  - name: test
    num_bytes: 147281.4805675509
    num_examples: 80
  download_size: 1021547
  dataset_size: 2918014.333744602
- config_name: yo
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 12281049.386416862
    num_examples: 11220
  - name: test
    num_bytes: 646889.499765808
    num_examples: 591
  download_size: 4500391
  dataset_size: 12927938.88618267
- config_name: zu
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 5538797.656363636
    num_examples: 8361
  - name: test
    num_bytes: 292143.25636363635
    num_examples: 441
  download_size: 2029609
  dataset_size: 5830940.912727272
configs:
- config_name: am
  data_files:
  - split: train
    path: am/train-*
  - split: test
    path: am/test-*
- config_name: ary
  data_files:
  - split: train
    path: ary/train-*
  - split: test
    path: ary/test-*
- config_name: bm
  data_files:
  - split: train
    path: bm/train-*
  - split: test
    path: bm/test-*
- config_name: ee
  data_files:
  - split: train
    path: ee/train-*
  - split: test
    path: ee/test-*
- config_name: fon
  data_files:
  - split: train
    path: fon/train-*
  - split: test
    path: fon/test-*
- config_name: ha
  data_files:
  - split: train
    path: ha/train-*
  - split: test
    path: ha/test-*
- config_name: ig
  data_files:
  - split: train
    path: ig/train-*
  - split: test
    path: ig/test-*
- config_name: lg
  data_files:
  - split: train
    path: lg/train-*
  - split: test
    path: lg/test-*
- config_name: ln
  data_files:
  - split: train
    path: ln/train-*
  - split: test
    path: ln/test-*
- config_name: ny
  data_files:
  - split: train
    path: ny/train-*
  - split: test
    path: ny/test-*
- config_name: om
  data_files:
  - split: train
    path: om/train-*
  - split: test
    path: om/test-*
- config_name: pcm
  data_files:
  - split: train
    path: pcm/train-*
  - split: test
    path: pcm/test-*
- config_name: rn
  data_files:
  - split: train
    path: rn/train-*
  - split: test
    path: rn/test-*
- config_name: rw
  data_files:
  - split: train
    path: rw/train-*
  - split: test
    path: rw/test-*
- config_name: sn
  data_files:
  - split: train
    path: sn/train-*
  - split: test
    path: sn/test-*
- config_name: so
  data_files:
  - split: train
    path: so/train-*
  - split: test
    path: so/test-*
- config_name: sw
  data_files:
  - split: train
    path: sw/train-*
  - split: test
    path: sw/test-*
- config_name: ti
  data_files:
  - split: train
    path: ti/train-*
  - split: test
    path: ti/test-*
- config_name: tn
  data_files:
  - split: train
    path: tn/train-*
  - split: test
    path: tn/test-*
- config_name: ts
  data_files:
  - split: train
    path: ts/train-*
  - split: test
    path: ts/test-*
- config_name: tw
  data_files:
  - split: train
    path: tw/train-*
  - split: test
    path: tw/test-*
- config_name: wo
  data_files:
  - split: train
    path: wo/train-*
  - split: test
    path: wo/test-*
- config_name: yo
  data_files:
  - split: train
    path: yo/train-*
  - split: test
    path: yo/test-*
- config_name: zu
  data_files:
  - split: train
    path: zu/train-*
  - split: test
    path: zu/test-*
---
提供机构:
WikiQuality



