singletongue/wikipedia-utils
收藏 | Hugging Face | 2024-04-09 更新 | 2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/singletongue/wikipedia-utils
下载链接
链接失效反馈
官方服务:
资源简介:
---
language:
- ja
license:
- cc-by-sa-3.0
- gfdl
size_categories:
- 10M<n<100M
dataset_info:
- config_name: corpus-jawiki-20230403
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 3569619848
num_examples: 24387500
download_size: 2147866840
dataset_size: 3569619848
- config_name: corpus-jawiki-20230403-cirrus
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 4779055224
num_examples: 28018607
download_size: 2829724501
dataset_size: 4779055224
- config_name: corpus-jawiki-20230403-filtered-large
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 3027074884
num_examples: 20133720
download_size: 1811952868
dataset_size: 3027074884
- config_name: corpus-jawiki-20240401
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 3746841610
num_examples: 25529795
download_size: 2252626876
dataset_size: 3746841610
- config_name: corpus-jawiki-20240401-cirrus
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 4890779202
num_examples: 28628755
download_size: 2894334977
dataset_size: 4890779202
- config_name: corpus-jawiki-20240401-filtered-large
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 3099640958
num_examples: 20555941
download_size: 1854398729
dataset_size: 3099640958
- config_name: paragraphs-jawiki-20230403
features:
- name: id
dtype: string
- name: pageid
dtype: int64
- name: revid
dtype: int64
- name: paragraph_index
dtype: int64
- name: title
dtype: string
- name: section
dtype: string
- name: text
dtype: string
- name: html_tag
dtype: string
splits:
- name: train
num_bytes: 4417130987
num_examples: 9668476
download_size: 2267871116
dataset_size: 4417130987
- config_name: paragraphs-jawiki-20240401
features:
- name: id
dtype: string
- name: pageid
dtype: int64
- name: revid
dtype: int64
- name: paragraph_index
dtype: int64
- name: title
dtype: string
- name: section
dtype: string
- name: text
dtype: string
- name: html_tag
dtype: string
splits:
- name: train
num_bytes: 4636360718
num_examples: 10144171
download_size: 2377450434
dataset_size: 4636360718
- config_name: passages-c300-jawiki-20230403
features:
- name: id
dtype: int64
- name: pageid
dtype: int64
- name: revid
dtype: int64
- name: title
dtype: string
- name: section
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3939431360
num_examples: 6639833
download_size: 2195484941
dataset_size: 3939431360
- config_name: passages-c300-jawiki-20240401
features:
- name: id
dtype: int64
- name: pageid
dtype: int64
- name: revid
dtype: int64
- name: title
dtype: string
- name: section
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4133902561
num_examples: 6947948
download_size: 2301413361
dataset_size: 4133902561
- config_name: passages-c400-jawiki-20230403
features:
- name: id
dtype: int64
- name: pageid
dtype: int64
- name: revid
dtype: int64
- name: title
dtype: string
- name: section
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3868482519
num_examples: 5555583
download_size: 2184871432
dataset_size: 3868482519
- config_name: passages-c400-jawiki-20240401
features:
- name: id
dtype: int64
- name: pageid
dtype: int64
- name: revid
dtype: int64
- name: title
dtype: string
- name: section
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4059226579
num_examples: 5807053
download_size: 2290149089
dataset_size: 4059226579
- config_name: passages-para-jawiki-20230403
features:
- name: id
dtype: int64
- name: pageid
dtype: int64
- name: revid
dtype: int64
- name: title
dtype: string
- name: section
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3751418134
num_examples: 9397066
download_size: 2019697290
dataset_size: 3751418134
- config_name: passages-para-jawiki-20240401
features:
- name: id
dtype: int64
- name: pageid
dtype: int64
- name: revid
dtype: int64
- name: title
dtype: string
- name: section
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3933648342
num_examples: 9856972
download_size: 2115404463
dataset_size: 3933648342
configs:
- config_name: corpus-jawiki-20230403
data_files:
- split: train
path: corpus-jawiki-20230403/train-*
- config_name: corpus-jawiki-20230403-cirrus
data_files:
- split: train
path: corpus-jawiki-20230403-cirrus/train-*
- config_name: corpus-jawiki-20230403-filtered-large
data_files:
- split: train
path: corpus-jawiki-20230403-filtered-large/train-*
- config_name: corpus-jawiki-20240401
data_files:
- split: train
path: corpus-jawiki-20240401/train-*
- config_name: corpus-jawiki-20240401-cirrus
data_files:
- split: train
path: corpus-jawiki-20240401-cirrus/train-*
- config_name: corpus-jawiki-20240401-filtered-large
data_files:
- split: train
path: corpus-jawiki-20240401-filtered-large/train-*
- config_name: paragraphs-jawiki-20230403
data_files:
- split: train
path: paragraphs-jawiki-20230403/train-*
- config_name: paragraphs-jawiki-20240401
data_files:
- split: train
path: paragraphs-jawiki-20240401/train-*
- config_name: passages-c300-jawiki-20230403
data_files:
- split: train
path: passages-c300-jawiki-20230403/train-*
- config_name: passages-c300-jawiki-20240401
data_files:
- split: train
path: passages-c300-jawiki-20240401/train-*
- config_name: passages-c400-jawiki-20230403
data_files:
- split: train
path: passages-c400-jawiki-20230403/train-*
- config_name: passages-c400-jawiki-20240401
data_files:
- split: train
path: passages-c400-jawiki-20240401/train-*
- config_name: passages-para-jawiki-20230403
data_files:
- split: train
path: passages-para-jawiki-20230403/train-*
- config_name: passages-para-jawiki-20240401
data_files:
- split: train
path: passages-para-jawiki-20240401/train-*
---
# Wikipedia-Utils: Preprocessed Wikipedia Texts for NLP
Preprocessed Wikipedia texts generated with the scripts in the [singletongue/wikipedia-utils](https://github.com/singletongue/wikipedia-utils) repo.
For detailed information on how the texts are processed, please refer to the repo.
提供机构:
singletongue
原始信息汇总
数据集概述
数据集语言
- 日语(ja)
许可证
- 创作共用署名-相同方式共享 3.0(cc-by-sa-3.0)
- GNU自由文档许可证(gfdl)
数据集大小
- 10M<n<100M
数据集配置详情
- corpus-jawiki-20230403
  - 特征:
    - text (字符串)
  - 分割:
    - train
      - 字节数: 3569619848
      - 示例数: 24387500
  - 下载大小: 2147866840
  - 数据集大小: 3569619848
- corpus-jawiki-20230403-cirrus
  - 特征:
    - text (字符串)
  - 分割:
    - train
      - 字节数: 4779055224
      - 示例数: 28018607
  - 下载大小: 2829724501
  - 数据集大小: 4779055224
- corpus-jawiki-20230403-filtered-large
  - 特征:
    - text (字符串)
  - 分割:
    - train
      - 字节数: 3027074884
      - 示例数: 20133720
  - 下载大小: 1811952868
  - 数据集大小: 3027074884
- corpus-jawiki-20240401
  - 特征:
    - text (字符串)
  - 分割:
    - train
      - 字节数: 3746841610
      - 示例数: 25529795
  - 下载大小: 2252626876
  - 数据集大小: 3746841610
- corpus-jawiki-20240401-cirrus
  - 特征:
    - text (字符串)
  - 分割:
    - train
      - 字节数: 4890779202
      - 示例数: 28628755
  - 下载大小: 2894334977
  - 数据集大小: 4890779202
- corpus-jawiki-20240401-filtered-large
  - 特征:
    - text (字符串)
  - 分割:
    - train
      - 字节数: 3099640958
      - 示例数: 20555941
  - 下载大小: 1854398729
  - 数据集大小: 3099640958
- paragraphs-jawiki-20230403
  - 特征:
    - id (字符串)
    - pageid (整数)
    - revid (整数)
    - paragraph_index (整数)
    - title (字符串)
    - section (字符串)
    - text (字符串)
    - html_tag (字符串)
  - 分割:
    - train
      - 字节数: 4417130987
      - 示例数: 9668476
  - 下载大小: 2267871116
  - 数据集大小: 4417130987
- paragraphs-jawiki-20240401
  - 特征:
    - id (字符串)
    - pageid (整数)
    - revid (整数)
    - paragraph_index (整数)
    - title (字符串)
    - section (字符串)
    - text (字符串)
    - html_tag (字符串)
  - 分割:
    - train
      - 字节数: 4636360718
      - 示例数: 10144171
  - 下载大小: 2377450434
  - 数据集大小: 4636360718
- passages-c300-jawiki-20230403
  - 特征:
    - id (整数)
    - pageid (整数)
    - revid (整数)
    - title (字符串)
    - section (字符串)
    - text (字符串)
  - 分割:
    - train
      - 字节数: 3939431360
      - 示例数: 6639833
  - 下载大小: 2195484941
  - 数据集大小: 3939431360
- passages-c300-jawiki-20240401
  - 特征:
    - id (整数)
    - pageid (整数)
    - revid (整数)
    - title (字符串)
    - section (字符串)
    - text (字符串)
  - 分割:
    - train
      - 字节数: 4133902561
      - 示例数: 6947948
  - 下载大小: 2301413361
  - 数据集大小: 4133902561
- passages-c400-jawiki-20230403
  - 特征:
    - id (整数)
    - pageid (整数)
    - revid (整数)
    - title (字符串)
    - section (字符串)
    - text (字符串)
  - 分割:
    - train
      - 字节数: 3868482519
      - 示例数: 5555583
  - 下载大小: 2184871432
  - 数据集大小: 3868482519
- passages-c400-jawiki-20240401
  - 特征:
    - id (整数)
    - pageid (整数)
    - revid (整数)
    - title (字符串)
    - section (字符串)
    - text (字符串)
  - 分割:
    - train
      - 字节数: 4059226579
      - 示例数: 5807053
  - 下载大小: 2290149089
  - 数据集大小: 4059226579
- passages-para-jawiki-20230403
  - 特征:
    - id (整数)
    - pageid (整数)
    - revid (整数)
    - title (字符串)
    - section (字符串)
    - text (字符串)
  - 分割:
    - train
      - 字节数: 3751418134
      - 示例数: 9397066
  - 下载大小: 2019697290
  - 数据集大小: 3751418134
- passages-para-jawiki-20240401
  - 特征:
    - id (整数)
    - pageid (整数)
    - revid (整数)
    - title (字符串)
    - section (字符串)
    - text (字符串)
  - 分割:
    - train
      - 字节数: 3933648342
      - 示例数: 9856972
  - 下载大小: 2115404463
  - 数据集大小: 3933648342



