WikiQuality/unique_character_trigrams_lo
收藏 | Hugging Face | 2024-08-06 更新 | 2024-06-29 收录
下载链接:
https://hf-mirror.com/datasets/WikiQuality/unique_character_trigrams_lo
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: am
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 19032652.73690671
    num_examples: 10635
  - name: test
    num_bytes: 2115335.7343698856
    num_examples: 1182
  download_size: 5463788
  dataset_size: 21147988.471276596
- config_name: ary
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 7467063.340437788
    num_examples: 5058
  - name: test
    num_bytes: 829673.7044930876
    num_examples: 562
  download_size: 2172124
  dataset_size: 8296737.044930875
- config_name: ha
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 57495346.86711936
    num_examples: 27098
  - name: test
    num_bytes: 6388607.624802435
    num_examples: 3011
  download_size: 21920933
  dataset_size: 63883954.49192179
- config_name: ig
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 47381914.35744877
    num_examples: 16603
  - name: test
    num_bytes: 5265291.3322588075
    num_examples: 1845
  download_size: 17614582
  dataset_size: 52647205.68970758
- config_name: om
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 2830780.3776223776
    num_examples: 1583
  - name: test
    num_bytes: 314729.8461538461
    num_examples: 176
  download_size: 1030238
  dataset_size: 3145510.2237762236
- config_name: pcm
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 1279161.2766865927
    num_examples: 853
  - name: test
    num_bytes: 142462.2758326217
    num_examples: 95
  download_size: 583167
  dataset_size: 1421623.5525192143
- config_name: rw
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 8555104.288206743
    num_examples: 5963
  - name: test
    num_bytes: 951204.7866981503
    num_examples: 663
  download_size: 3486788
  dataset_size: 9506309.074904893
- config_name: sw
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 51271218.611174345
    num_examples: 48128
  - name: test
    num_bytes: 5697275.538824809
    num_examples: 5348
  download_size: 19483770
  dataset_size: 56968494.14999916
- config_name: ti
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 615180.1512195122
    num_examples: 362
  - name: test
    num_bytes: 69675.1
    num_examples: 41
  download_size: 199506
  dataset_size: 684855.2512195122
- config_name: ts
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 623293.894952251
    num_examples: 579
  - name: test
    num_bytes: 69972.5443383356
    num_examples: 65
  download_size: 240569
  dataset_size: 693266.4392905866
- config_name: tw
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 5681267.819867549
    num_examples: 2915
  - name: test
    num_bytes: 631468.5329801325
    num_examples: 324
  download_size: 2092471
  dataset_size: 6312736.352847682
- config_name: yo
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 10681212.071240654
    num_examples: 12223
  - name: test
    num_bytes: 1187578.1072417612
    num_examples: 1359
  download_size: 4188660
  dataset_size: 11868790.178482415
configs:
- config_name: am
  data_files:
  - split: train
    path: am/train-*
  - split: test
    path: am/test-*
- config_name: ary
  data_files:
  - split: train
    path: ary/train-*
  - split: test
    path: ary/test-*
- config_name: ha
  data_files:
  - split: train
    path: ha/train-*
  - split: test
    path: ha/test-*
- config_name: ig
  data_files:
  - split: train
    path: ig/train-*
  - split: test
    path: ig/test-*
- config_name: om
  data_files:
  - split: train
    path: om/train-*
  - split: test
    path: om/test-*
- config_name: pcm
  data_files:
  - split: train
    path: pcm/train-*
  - split: test
    path: pcm/test-*
- config_name: rw
  data_files:
  - split: train
    path: rw/train-*
  - split: test
    path: rw/test-*
- config_name: sw
  data_files:
  - split: train
    path: sw/train-*
  - split: test
    path: sw/test-*
- config_name: ti
  data_files:
  - split: train
    path: ti/train-*
  - split: test
    path: ti/test-*
- config_name: ts
  data_files:
  - split: train
    path: ts/train-*
  - split: test
    path: ts/test-*
- config_name: tw
  data_files:
  - split: train
    path: tw/train-*
  - split: test
    path: tw/test-*
- config_name: yo
  data_files:
  - split: train
    path: yo/train-*
  - split: test
    path: yo/test-*
---
提供机构:
WikiQuality
原始信息汇总
数据集概述
配置信息
ha
- 特征:
- id: string
- url: string
- title: string
- text: string
- 分割:
- train:
- 字节数: 52116864.4253036
- 样本数: 24856
- test:
- 字节数: 2744648.1949115875
- 样本数: 1309
- 总体大小:
- 下载大小: 20733049
- 数据集大小: 54861512.620215185
- 数据文件路径:
- train: ha/train-*
- test: ha/test-*
ig
- 特征:
- id: string
- url: string
- title: string
- text: string
- 分割:
- train:
- 字节数: 46339376.52849003
- 样本数: 16267
- test:
- 字节数: 2441313.437321937
- 样本数: 857
- 总体大小:
- 下载大小: 16793803
- 数据集大小: 48780689.96581197
- 数据文件路径:
- train: ig/train-*
- test: ig/test-*
pcm
- 特征:
- id: string
- url: string
- title: string
- text: string
- 分割:
- train:
- 字节数: 1156227.0
- 样本数: 774
- test:
- 字节数: 61247.166666666664
- 样本数: 41
- 总体大小:
- 下载大小: 550910
- 数据集大小: 1217474.1666666667
- 数据文件路径:
- train: pcm/train-*
- test: pcm/test-*
sw
- 特征:
- id: string
- url: string
- title: string
- text: string
- 分割:
- train:
- 字节数: 52163212.20970572
- 样本数: 57292
- test:
- 字节数: 2746007.261475816
- 样本数: 3016
- 总体大小:
- 下载大小: 18771656
- 数据集大小: 54909219.471181534
- 数据文件路径:
- train: sw/train-*
- test: sw/test-*
yo
- 特征:
- id: string
- url: string
- title: string
- text: string
- 分割:
- train:
- 字节数: 3896929.941608566
- 样本数: 8269
- test:
- 字节数: 205473.63097609562
- 样本数: 436
- 总体大小:
- 下载大小: 3682307
- 数据集大小: 4102403.5725846617
- 数据文件路径:
- train: yo/train-*
- test: yo/test-*



