HeshamHaroon/arabic-turath-ocr
收藏Hugging Face2026-04-22 更新2026-04-26 收录
下载链接:
https://hf-mirror.com/datasets/HeshamHaroon/arabic-turath-ocr
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: az_0000
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2149805
num_examples: 500
download_size: 1001313
dataset_size: 2149805
- config_name: az_0001
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2922805
num_examples: 500
download_size: 1464060
dataset_size: 2922805
- config_name: az_0002
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2915420
num_examples: 500
download_size: 1401371
dataset_size: 2915420
- config_name: az_0003
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2298269
num_examples: 500
download_size: 1104389
dataset_size: 2298269
- config_name: az_0004
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2108940
num_examples: 500
download_size: 989149
dataset_size: 2108940
- config_name: az_0005
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1960020
num_examples: 500
download_size: 907905
dataset_size: 1960020
- config_name: az_0006
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2085251
num_examples: 500
download_size: 1037152
dataset_size: 2085251
- config_name: az_0007
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2380241
num_examples: 500
download_size: 1220013
dataset_size: 2380241
- config_name: az_0008
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2652167
num_examples: 500
download_size: 1316951
dataset_size: 2652167
- config_name: az_0009
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2194555
num_examples: 500
download_size: 1055669
dataset_size: 2194555
- config_name: az_0010
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1776098
num_examples: 500
download_size: 830356
dataset_size: 1776098
- config_name: az_0011
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2364331
num_examples: 500
download_size: 1220125
dataset_size: 2364331
- config_name: az_0012
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2371915
num_examples: 500
download_size: 1198084
dataset_size: 2371915
- config_name: az_0013
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2300192
num_examples: 500
download_size: 1165491
dataset_size: 2300192
- config_name: az_0014
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2304144
num_examples: 500
download_size: 1118531
dataset_size: 2304144
- config_name: az_0015
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2367229
num_examples: 500
download_size: 1085413
dataset_size: 2367229
- config_name: az_0016
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2500685
num_examples: 500
download_size: 1192074
dataset_size: 2500685
- config_name: az_0017
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2396395
num_examples: 500
download_size: 1131342
dataset_size: 2396395
- config_name: az_0018
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2257066
num_examples: 500
download_size: 1080108
dataset_size: 2257066
- config_name: az_0019
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2288829
num_examples: 500
download_size: 1149823
dataset_size: 2288829
- config_name: az_0020
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2187761
num_examples: 500
download_size: 1095915
dataset_size: 2187761
- config_name: az_0021
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2288580
num_examples: 500
download_size: 1169409
dataset_size: 2288580
- config_name: az_0022
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2363844
num_examples: 500
download_size: 1184094
dataset_size: 2363844
- config_name: az_0023
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2162307
num_examples: 500
download_size: 1050723
dataset_size: 2162307
- config_name: az_0024
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2156109
num_examples: 500
download_size: 1002049
dataset_size: 2156109
- config_name: az_0025
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2234643
num_examples: 500
download_size: 1070173
dataset_size: 2234643
- config_name: az_0026
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2084480
num_examples: 500
download_size: 1015775
dataset_size: 2084480
- config_name: az_0027
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1713504
num_examples: 500
download_size: 806434
dataset_size: 1713504
- config_name: az_0028
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1918878
num_examples: 500
download_size: 931339
dataset_size: 1918878
- config_name: az_0029
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2009961
num_examples: 500
download_size: 979438
dataset_size: 2009961
- config_name: az_0030
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2164741
num_examples: 500
download_size: 1066491
dataset_size: 2164741
- config_name: az_0031
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2098778
num_examples: 500
download_size: 1010825
dataset_size: 2098778
- config_name: az_0032
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2014223
num_examples: 500
download_size: 937746
dataset_size: 2014223
- config_name: az_0033
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2173380
num_examples: 500
download_size: 1039837
dataset_size: 2173380
- config_name: az_0034
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2029270
num_examples: 500
download_size: 977628
dataset_size: 2029270
- config_name: az_0035
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1925830
num_examples: 500
download_size: 931888
dataset_size: 1925830
- config_name: az_0036
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1774747
num_examples: 500
download_size: 838692
dataset_size: 1774747
- config_name: az_0037
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2222887
num_examples: 500
download_size: 1037452
dataset_size: 2222887
- config_name: az_0038
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2109605
num_examples: 500
download_size: 1005508
dataset_size: 2109605
- config_name: az_0039
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2224339
num_examples: 500
download_size: 1068560
dataset_size: 2224339
- config_name: az_0040
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2190743
num_examples: 500
download_size: 1053180
dataset_size: 2190743
- config_name: az_0041
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2515221
num_examples: 500
download_size: 1236199
dataset_size: 2515221
- config_name: az_0042
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2202610
num_examples: 500
download_size: 1086911
dataset_size: 2202610
- config_name: az_0043
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2286509
num_examples: 500
download_size: 1117863
dataset_size: 2286509
- config_name: az_0044
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2422135
num_examples: 500
download_size: 1177178
dataset_size: 2422135
- config_name: az_0045
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2046043
num_examples: 500
download_size: 976134
dataset_size: 2046043
- config_name: az_0046
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2599821
num_examples: 500
download_size: 1316783
dataset_size: 2599821
- config_name: az_0047
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2744838
num_examples: 500
download_size: 1407474
dataset_size: 2744838
- config_name: az_0048
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2169131
num_examples: 500
download_size: 1048403
dataset_size: 2169131
- config_name: az_0049
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2156448
num_examples: 500
download_size: 995071
dataset_size: 2156448
- config_name: az_0050
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2039138
num_examples: 500
download_size: 938512
dataset_size: 2039138
- config_name: az_0051
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2043871
num_examples: 500
download_size: 938608
dataset_size: 2043871
- config_name: az_0052
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2171441
num_examples: 500
download_size: 1062062
dataset_size: 2171441
- config_name: az_0053
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2361178
num_examples: 500
download_size: 1172513
dataset_size: 2361178
- config_name: az_0054
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2362961
num_examples: 500
download_size: 1207552
dataset_size: 2362961
- config_name: az_0055
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2277198
num_examples: 500
download_size: 1137274
dataset_size: 2277198
- config_name: az_0056
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2438400
num_examples: 500
download_size: 1215277
dataset_size: 2438400
- config_name: az_0057
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2214714
num_examples: 500
download_size: 1084531
dataset_size: 2214714
- config_name: az_0058
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2055133
num_examples: 500
download_size: 1021928
dataset_size: 2055133
- config_name: az_0059
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2368761
num_examples: 500
download_size: 1227582
dataset_size: 2368761
- config_name: az_0060
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2019996
num_examples: 500
download_size: 1059909
dataset_size: 2019996
- config_name: az_0061
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1650844
num_examples: 500
download_size: 819493
dataset_size: 1650844
- config_name: az_0062
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2250081
num_examples: 500
download_size: 1141411
dataset_size: 2250081
- config_name: az_0063
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2065394
num_examples: 500
download_size: 1063117
dataset_size: 2065394
- config_name: az_0064
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1980662
num_examples: 500
download_size: 1034114
dataset_size: 1980662
- config_name: az_0065
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2048863
num_examples: 500
download_size: 1077893
dataset_size: 2048863
- config_name: az_0066
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2041032
num_examples: 500
download_size: 1068985
dataset_size: 2041032
- config_name: az_0067
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2143844
num_examples: 500
download_size: 1065293
dataset_size: 2143844
- config_name: az_0068
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2475869
num_examples: 500
download_size: 1222195
dataset_size: 2475869
- config_name: az_0069
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2692067
num_examples: 500
download_size: 1352353
dataset_size: 2692067
- config_name: az_0070
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2309778
num_examples: 500
download_size: 1205180
dataset_size: 2309778
- config_name: az_0071
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2463649
num_examples: 500
download_size: 1291881
dataset_size: 2463649
- config_name: az_0072
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2323142
num_examples: 500
download_size: 1174193
dataset_size: 2323142
- config_name: az_0073
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2064597
num_examples: 500
download_size: 1052398
dataset_size: 2064597
- config_name: az_0074
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1974281
num_examples: 500
download_size: 978334
dataset_size: 1974281
- config_name: az_0075
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1678446
num_examples: 500
download_size: 865125
dataset_size: 1678446
- config_name: az_0076
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1249524
num_examples: 500
download_size: 681147
dataset_size: 1249524
- config_name: az_0077
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1600356
num_examples: 500
download_size: 803701
dataset_size: 1600356
- config_name: az_0078
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1893135
num_examples: 500
download_size: 956181
dataset_size: 1893135
- config_name: az_0079
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2320054
num_examples: 500
download_size: 1177482
dataset_size: 2320054
- config_name: az_0080
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2698679
num_examples: 500
download_size: 1392484
dataset_size: 2698679
- config_name: az_0081
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2899288
num_examples: 500
download_size: 1538324
dataset_size: 2899288
- config_name: az_0082
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2731914
num_examples: 500
download_size: 1435894
dataset_size: 2731914
- config_name: az_0083
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2129787
num_examples: 500
download_size: 1089103
dataset_size: 2129787
- config_name: az_0084
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2484050
num_examples: 500
download_size: 1259404
dataset_size: 2484050
- config_name: az_0085
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1858172
num_examples: 500
download_size: 954510
dataset_size: 1858172
- config_name: az_0086
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2032553
num_examples: 500
download_size: 1024769
dataset_size: 2032553
- config_name: az_0087
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2441932
num_examples: 500
download_size: 1228729
dataset_size: 2441932
- config_name: az_0088
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2431228
num_examples: 500
download_size: 1225280
dataset_size: 2431228
- config_name: az_0089
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1748215
num_examples: 500
download_size: 866581
dataset_size: 1748215
- config_name: az_0090
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1814011
num_examples: 500
download_size: 904980
dataset_size: 1814011
- config_name: az_0091
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1835938
num_examples: 500
download_size: 931733
dataset_size: 1835938
- config_name: az_0092
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3087377
num_examples: 500
download_size: 1603205
dataset_size: 3087377
- config_name: az_0093
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3167666
num_examples: 500
download_size: 1663256
dataset_size: 3167666
- config_name: az_0094
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2691021
num_examples: 500
download_size: 1456920
dataset_size: 2691021
- config_name: az_0095
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1864215
num_examples: 500
download_size: 1033709
dataset_size: 1864215
- config_name: az_0096
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1983753
num_examples: 500
download_size: 1004495
dataset_size: 1983753
- config_name: az_0097
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3381242
num_examples: 500
download_size: 1755749
dataset_size: 3381242
- config_name: az_0098
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3438384
num_examples: 500
download_size: 1864182
dataset_size: 3438384
- config_name: az_0099
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3260118
num_examples: 500
download_size: 1697014
dataset_size: 3260118
- config_name: az_0100
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3295992
num_examples: 500
download_size: 1694601
dataset_size: 3295992
- config_name: az_0101
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2504729
num_examples: 500
download_size: 1282007
dataset_size: 2504729
- config_name: az_0102
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2209830
num_examples: 500
download_size: 1143101
dataset_size: 2209830
- config_name: az_0103
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2036418
num_examples: 500
download_size: 1056440
dataset_size: 2036418
- config_name: az_0104
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2128777
num_examples: 500
download_size: 1102405
dataset_size: 2128777
- config_name: az_0105
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2164505
num_examples: 500
download_size: 1119622
dataset_size: 2164505
- config_name: az_0106
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2155392
num_examples: 500
download_size: 1086914
dataset_size: 2155392
- config_name: az_0107
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2534702
num_examples: 500
download_size: 1262567
dataset_size: 2534702
- config_name: az_0108
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2598032
num_examples: 500
download_size: 1455657
dataset_size: 2598032
- config_name: az_0109
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2530839
num_examples: 500
download_size: 1422056
dataset_size: 2530839
- config_name: az_0110
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2540529
num_examples: 500
download_size: 1420482
dataset_size: 2540529
- config_name: az_0111
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2263060
num_examples: 500
download_size: 1242913
dataset_size: 2263060
- config_name: az_0112
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2263253
num_examples: 500
download_size: 1248177
dataset_size: 2263253
- config_name: az_0113
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2047611
num_examples: 500
download_size: 1065900
dataset_size: 2047611
- config_name: az_0114
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2003367
num_examples: 500
download_size: 1048269
dataset_size: 2003367
- config_name: az_0115
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2453524
num_examples: 500
download_size: 1237828
dataset_size: 2453524
- config_name: az_0116
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2042947
num_examples: 500
download_size: 1031133
dataset_size: 2042947
- config_name: az_0117
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2154676
num_examples: 500
download_size: 1087414
dataset_size: 2154676
- config_name: az_0118
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2370124
num_examples: 500
download_size: 1242235
dataset_size: 2370124
- config_name: az_0119
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1833768
num_examples: 500
download_size: 954923
dataset_size: 1833768
- config_name: az_0120
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1634186
num_examples: 500
download_size: 857038
dataset_size: 1634186
- config_name: az_0121
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2353410
num_examples: 500
download_size: 1211130
dataset_size: 2353410
- config_name: az_0122
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1627976
num_examples: 500
download_size: 852060
dataset_size: 1627976
- config_name: az_0123
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1735164
num_examples: 500
download_size: 870990
dataset_size: 1735164
- config_name: az_0124
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1461955
num_examples: 500
download_size: 726459
dataset_size: 1461955
- config_name: az_0125
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2289309
num_examples: 500
download_size: 1202752
dataset_size: 2289309
- config_name: az_0126
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2087622
num_examples: 500
download_size: 1120653
dataset_size: 2087622
- config_name: az_0127
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2040141
num_examples: 500
download_size: 1082785
dataset_size: 2040141
- config_name: az_0128
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2544684
num_examples: 500
download_size: 1342323
dataset_size: 2544684
- config_name: az_0129
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1880963
num_examples: 500
download_size: 940027
dataset_size: 1880963
- config_name: az_0130
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1645992
num_examples: 500
download_size: 844602
dataset_size: 1645992
- config_name: az_0131
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1762485
num_examples: 500
download_size: 917128
dataset_size: 1762485
- config_name: az_0132
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2926467
num_examples: 500
download_size: 1459517
dataset_size: 2926467
- config_name: az_0133
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2395619
num_examples: 500
download_size: 1261760
dataset_size: 2395619
- config_name: az_0134
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2375241
num_examples: 500
download_size: 1242788
dataset_size: 2375241
- config_name: az_0135
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1919966
num_examples: 500
download_size: 1015239
dataset_size: 1919966
- config_name: az_0136
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2732802
num_examples: 500
download_size: 1389265
dataset_size: 2732802
- config_name: az_0137
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3739484
num_examples: 500
download_size: 1914425
dataset_size: 3739484
- config_name: az_0138
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 4069010
num_examples: 500
download_size: 2126641
dataset_size: 4069010
- config_name: az_0139
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 4525487
num_examples: 500
download_size: 2203416
dataset_size: 4525487
- config_name: az_0140
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 4136632
num_examples: 500
download_size: 1968853
dataset_size: 4136632
- config_name: az_0141
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3336030
num_examples: 500
download_size: 1708966
dataset_size: 3336030
- config_name: az_0142
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2448051
num_examples: 500
download_size: 1390513
dataset_size: 2448051
- config_name: az_0143
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1863489
num_examples: 500
download_size: 1021110
dataset_size: 1863489
- config_name: az_0144
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1352662
num_examples: 500
download_size: 745221
dataset_size: 1352662
- config_name: az_0145
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2351271
num_examples: 500
download_size: 1286481
dataset_size: 2351271
- config_name: az_0146
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2699311
num_examples: 500
download_size: 1502192
dataset_size: 2699311
- config_name: az_0147
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2769821
num_examples: 500
download_size: 1551020
dataset_size: 2769821
- config_name: az_0148
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1932291
num_examples: 500
download_size: 993925
dataset_size: 1932291
- config_name: az_0149
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1874316
num_examples: 500
download_size: 941724
dataset_size: 1874316
- config_name: az_0150
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2117301
num_examples: 500
download_size: 916980
dataset_size: 2117301
- config_name: az_0151
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2416875
num_examples: 500
download_size: 1027054
dataset_size: 2416875
- config_name: az_0152
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2363614
num_examples: 500
download_size: 1031947
dataset_size: 2363614
- config_name: az_0153
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2279375
num_examples: 500
download_size: 990487
dataset_size: 2279375
- config_name: az_0154
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2463475
num_examples: 500
download_size: 1112817
dataset_size: 2463475
- config_name: az_0155
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2607502
num_examples: 500
download_size: 1187384
dataset_size: 2607502
- config_name: az_0156
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2805417
num_examples: 500
download_size: 1250707
dataset_size: 2805417
- config_name: az_0157
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2717619
num_examples: 500
download_size: 1201081
dataset_size: 2717619
- config_name: az_0158
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2671668
num_examples: 500
download_size: 1133605
dataset_size: 2671668
- config_name: az_0159
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2279994
num_examples: 500
download_size: 947428
dataset_size: 2279994
- config_name: az_0160
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2402643
num_examples: 500
download_size: 1054595
dataset_size: 2402643
- config_name: az_0161
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2574431
num_examples: 500
download_size: 1240916
dataset_size: 2574431
- config_name: az_0162
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2840402
num_examples: 500
download_size: 1493012
dataset_size: 2840402
- config_name: az_0163
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2729939
num_examples: 500
download_size: 1429219
dataset_size: 2729939
- config_name: az_0164
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2620145
num_examples: 500
download_size: 1345836
dataset_size: 2620145
- config_name: az_0165
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2708483
num_examples: 500
download_size: 1432508
dataset_size: 2708483
- config_name: az_0166
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2669739
num_examples: 500
download_size: 1403597
dataset_size: 2669739
- config_name: az_0167
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2626912
num_examples: 500
download_size: 1388457
dataset_size: 2626912
- config_name: az_0168
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2747506
num_examples: 500
download_size: 1460048
dataset_size: 2747506
- config_name: az_0169
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2697308
num_examples: 500
download_size: 1423776
dataset_size: 2697308
- config_name: az_0170
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2516395
num_examples: 500
download_size: 1325817
dataset_size: 2516395
- config_name: az_0171
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2578672
num_examples: 500
download_size: 1338182
dataset_size: 2578672
- config_name: az_0172
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2601975
num_examples: 500
download_size: 1342627
dataset_size: 2601975
- config_name: az_0173
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2440403
num_examples: 500
download_size: 1255250
dataset_size: 2440403
- config_name: az_0174
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2410654
num_examples: 500
download_size: 1208341
dataset_size: 2410654
- config_name: az_0175
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2289461
num_examples: 500
download_size: 1124516
dataset_size: 2289461
- config_name: az_0176
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2143245
num_examples: 500
download_size: 1063290
dataset_size: 2143245
- config_name: az_0177
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2838249
num_examples: 500
download_size: 1462552
dataset_size: 2838249
- config_name: az_0178
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2929987
num_examples: 500
download_size: 1528636
dataset_size: 2929987
- config_name: az_0179
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2974447
num_examples: 500
download_size: 1561502
dataset_size: 2974447
- config_name: az_0180
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3055645
num_examples: 500
download_size: 1607154
dataset_size: 3055645
- config_name: az_0181
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2966216
num_examples: 500
download_size: 1528954
dataset_size: 2966216
- config_name: az_0182
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2459525
num_examples: 500
download_size: 1199849
dataset_size: 2459525
- config_name: az_0183
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2306168
num_examples: 500
download_size: 1157239
dataset_size: 2306168
- config_name: az_0184
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2480562
num_examples: 500
download_size: 1233392
dataset_size: 2480562
- config_name: az_0185
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2335776
num_examples: 500
download_size: 1155336
dataset_size: 2335776
- config_name: az_0186
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2439873
num_examples: 500
download_size: 1236093
dataset_size: 2439873
- config_name: az_0187
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2305264
num_examples: 500
download_size: 1145613
dataset_size: 2305264
- config_name: az_0188
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2444564
num_examples: 500
download_size: 1204165
dataset_size: 2444564
- config_name: az_0189
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2364935
num_examples: 500
download_size: 1192001
dataset_size: 2364935
- config_name: az_0190
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2531537
num_examples: 500
download_size: 1292806
dataset_size: 2531537
- config_name: az_0191
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2322411
num_examples: 500
download_size: 1143635
dataset_size: 2322411
- config_name: az_0192
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1963201
num_examples: 500
download_size: 942563
dataset_size: 1963201
- config_name: az_0193
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1762659
num_examples: 500
download_size: 894043
dataset_size: 1762659
- config_name: az_0194
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1163306
num_examples: 500
download_size: 608797
dataset_size: 1163306
- config_name: az_0195
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2321721
num_examples: 500
download_size: 1169425
dataset_size: 2321721
- config_name: az_0196
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2178941
num_examples: 500
download_size: 1099618
dataset_size: 2178941
- config_name: az_0197
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2325151
num_examples: 500
download_size: 1163978
dataset_size: 2325151
- config_name: az_0198
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2494676
num_examples: 500
download_size: 1207515
dataset_size: 2494676
- config_name: az_0199
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2534971
num_examples: 500
download_size: 1223200
dataset_size: 2534971
- config_name: az_0200
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2397473
num_examples: 500
download_size: 1268103
dataset_size: 2397473
- config_name: az_0201
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2714046
num_examples: 500
download_size: 1405538
dataset_size: 2714046
- config_name: az_0202
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2180822
num_examples: 500
download_size: 1111003
dataset_size: 2180822
- config_name: az_0203
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2238850
num_examples: 500
download_size: 1086414
dataset_size: 2238850
- config_name: az_0204
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2117538
num_examples: 500
download_size: 1058718
dataset_size: 2117538
- config_name: az_0205
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2257238
num_examples: 500
download_size: 1117018
dataset_size: 2257238
- config_name: az_0206
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2640253
num_examples: 500
download_size: 1350440
dataset_size: 2640253
- config_name: az_0207
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3167338
num_examples: 500
download_size: 1641233
dataset_size: 3167338
- config_name: az_0208
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3198357
num_examples: 500
download_size: 1651784
dataset_size: 3198357
- config_name: az_0209
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3029910
num_examples: 500
download_size: 1534713
dataset_size: 3029910
- config_name: az_0210
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3060998
num_examples: 500
download_size: 1557913
dataset_size: 3060998
- config_name: az_0211
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2501678
num_examples: 500
download_size: 1300652
dataset_size: 2501678
- config_name: az_0212
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2450150
num_examples: 500
download_size: 1297687
dataset_size: 2450150
- config_name: az_0213
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2461482
num_examples: 500
download_size: 1292412
dataset_size: 2461482
- config_name: az_0214
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2492459
num_examples: 500
download_size: 1314999
dataset_size: 2492459
- config_name: az_0215
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2820654
num_examples: 500
download_size: 1435278
dataset_size: 2820654
- config_name: az_0216
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2927521
num_examples: 500
download_size: 1460446
dataset_size: 2927521
- config_name: az_0217
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3035491
num_examples: 500
download_size: 1560436
dataset_size: 3035491
- config_name: az_0218
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2970115
num_examples: 500
download_size: 1476395
dataset_size: 2970115
- config_name: az_0219
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2338644
num_examples: 500
download_size: 1185797
dataset_size: 2338644
- config_name: az_0220
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 1775520
num_examples: 500
download_size: 885267
dataset_size: 1775520
- config_name: az_0221
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2111965
num_examples: 500
download_size: 992232
dataset_size: 2111965
- config_name: az_0222
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2368454
num_examples: 500
download_size: 1284273
dataset_size: 2368454
- config_name: az_0223
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2441144
num_examples: 500
download_size: 1321910
dataset_size: 2441144
- config_name: az_0224
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2582544
num_examples: 500
download_size: 1379843
dataset_size: 2582544
- config_name: az_0225
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2809231
num_examples: 500
download_size: 1457441
dataset_size: 2809231
- config_name: az_0226
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3138888
num_examples: 500
download_size: 1680676
dataset_size: 3138888
- config_name: az_0227
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2659631
num_examples: 500
download_size: 1422091
dataset_size: 2659631
- config_name: az_0228
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2674030
num_examples: 500
download_size: 1401449
dataset_size: 2674030
- config_name: az_0229
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2339652
num_examples: 500
download_size: 1206659
dataset_size: 2339652
- config_name: az_0230
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2387105
num_examples: 500
download_size: 1219624
dataset_size: 2387105
- config_name: az_0231
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2469454
num_examples: 500
download_size: 1273749
dataset_size: 2469454
- config_name: az_0232
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2425458
num_examples: 500
download_size: 1223132
dataset_size: 2425458
- config_name: az_0233
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2556456
num_examples: 500
download_size: 1351866
dataset_size: 2556456
- config_name: az_0234
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2383846
num_examples: 500
download_size: 1249054
dataset_size: 2383846
- config_name: az_0235
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2221236
num_examples: 500
download_size: 1183302
dataset_size: 2221236
- config_name: az_0236
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2737397
num_examples: 500
download_size: 1399849
dataset_size: 2737397
- config_name: az_0237
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2756784
num_examples: 500
download_size: 1413563
dataset_size: 2756784
- config_name: az_0238
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2545164
num_examples: 500
download_size: 1314620
dataset_size: 2545164
- config_name: az_0239
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2534525
num_examples: 500
download_size: 1265273
dataset_size: 2534525
- config_name: az_0240
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2557606
num_examples: 500
download_size: 1287762
dataset_size: 2557606
- config_name: az_0241
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2581162
num_examples: 500
download_size: 1293003
dataset_size: 2581162
- config_name: az_0242
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2564283
num_examples: 500
download_size: 1325571
dataset_size: 2564283
- config_name: az_0243
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2696182
num_examples: 500
download_size: 1348648
dataset_size: 2696182
- config_name: az_0244
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2810497
num_examples: 500
download_size: 1388259
dataset_size: 2810497
- config_name: az_0245
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2810426
num_examples: 500
download_size: 1387071
dataset_size: 2810426
- config_name: az_0246
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2965042
num_examples: 500
download_size: 1501217
dataset_size: 2965042
- config_name: az_0247
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 3040767
num_examples: 500
download_size: 1531004
dataset_size: 3040767
- config_name: az_0248
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2696065
num_examples: 500
download_size: 1353977
dataset_size: 2696065
- config_name: az_0249
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2489535
num_examples: 500
download_size: 1238811
dataset_size: 2489535
- config_name: az_0250
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2753762
num_examples: 500
download_size: 1377930
dataset_size: 2753762
- config_name: az_0251
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2241654
num_examples: 500
download_size: 1120150
dataset_size: 2241654
- config_name: az_0252
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2011569
num_examples: 500
download_size: 1028569
dataset_size: 2011569
- config_name: az_0253
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2036495
num_examples: 500
download_size: 1033915
dataset_size: 2036495
- config_name: az_0254
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2327692
num_examples: 500
download_size: 1146201
dataset_size: 2327692
- config_name: az_0255
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 2828003
num_examples: 500
download_size: 1336630
dataset_size: 2828003
- config_name: az_0256
features:
- name: item_id
dtype: string
- name: item_title
dtype: string
- name: book
dtype: string
- name: pdf_filename
dtype: string
- name: page
dtype: int64
- name: total_pages
dtype: int64
- name: text
dtype: string
- name: raw_text
dtype: string
- name: char_count
dtype: int64
- name: arabic_ratio
dtype: float64
- name: di_confidence
dtype: float64
- name: clean_model
dtype: string
- name: cleaned
dtype: bool
- name: error
dtype: 'null'
- name: pdf_url
dtype: string
- name: source_detail_url
dtype: string
splits:
- name: train
num_bytes: 664400
num_examples: 105
download_size: 334568
dataset_size: 664400
configs:
- config_name: az_0000
data_files:
- split: train
path: az_0000/train-*
- config_name: az_0001
data_files:
- split: train
path: az_0001/train-*
- config_name: az_0002
data_files:
- split: train
path: az_0002/train-*
- config_name: az_0003
data_files:
- split: train
path: az_0003/train-*
- config_name: az_0004
data_files:
- split: train
path: az_0004/train-*
- config_name: az_0005
data_files:
- split: train
path: az_0005/train-*
- config_name: az_0006
data_files:
- split: train
path: az_0006/train-*
- config_name: az_0007
data_files:
- split: train
path: az_0007/train-*
- config_name: az_0008
data_files:
- split: train
path: az_0008/train-*
- config_name: az_0009
data_files:
- split: train
path: az_0009/train-*
- config_name: az_0010
data_files:
- split: train
path: az_0010/train-*
- config_name: az_0011
data_files:
- split: train
path: az_0011/train-*
- config_name: az_0012
data_files:
- split: train
path: az_0012/train-*
- config_name: az_0013
data_files:
- split: train
path: az_0013/train-*
- config_name: az_0014
data_files:
- split: train
path: az_0014/train-*
- config_name: az_0015
data_files:
- split: train
path: az_0015/train-*
- config_name: az_0016
data_files:
- split: train
path: az_0016/train-*
- config_name: az_0017
data_files:
- split: train
path: az_0017/train-*
- config_name: az_0018
data_files:
- split: train
path: az_0018/train-*
- config_name: az_0019
data_files:
- split: train
path: az_0019/train-*
- config_name: az_0020
data_files:
- split: train
path: az_0020/train-*
- config_name: az_0021
data_files:
- split: train
path: az_0021/train-*
- config_name: az_0022
data_files:
- split: train
path: az_0022/train-*
- config_name: az_0023
data_files:
- split: train
path: az_0023/train-*
- config_name: az_0024
data_files:
- split: train
path: az_0024/train-*
- config_name: az_0025
data_files:
- split: train
path: az_0025/train-*
- config_name: az_0026
data_files:
- split: train
path: az_0026/train-*
- config_name: az_0027
data_files:
- split: train
path: az_0027/train-*
- config_name: az_0028
data_files:
- split: train
path: az_0028/train-*
- config_name: az_0029
data_files:
- split: train
path: az_0029/train-*
- config_name: az_0030
data_files:
- split: train
path: az_0030/train-*
- config_name: az_0031
data_files:
- split: train
path: az_0031/train-*
- config_name: az_0032
data_files:
- split: train
path: az_0032/train-*
- config_name: az_0033
data_files:
- split: train
path: az_0033/train-*
- config_name: az_0034
data_files:
- split: train
path: az_0034/train-*
- config_name: az_0035
data_files:
- split: train
path: az_0035/train-*
- config_name: az_0036
data_files:
- split: train
path: az_0036/train-*
- config_name: az_0037
data_files:
- split: train
path: az_0037/train-*
- config_name: az_0038
data_files:
- split: train
path: az_0038/train-*
- config_name: az_0039
data_files:
- split: train
path: az_0039/train-*
- config_name: az_0040
data_files:
- split: train
path: az_0040/train-*
- config_name: az_0041
data_files:
- split: train
path: az_0041/train-*
- config_name: az_0042
data_files:
- split: train
path: az_0042/train-*
- config_name: az_0043
data_files:
- split: train
path: az_0043/train-*
- config_name: az_0044
data_files:
- split: train
path: az_0044/train-*
- config_name: az_0045
data_files:
- split: train
path: az_0045/train-*
- config_name: az_0046
data_files:
- split: train
path: az_0046/train-*
- config_name: az_0047
data_files:
- split: train
path: az_0047/train-*
- config_name: az_0048
data_files:
- split: train
path: az_0048/train-*
- config_name: az_0049
data_files:
- split: train
path: az_0049/train-*
- config_name: az_0050
data_files:
- split: train
path: az_0050/train-*
- config_name: az_0051
data_files:
- split: train
path: az_0051/train-*
- config_name: az_0052
data_files:
- split: train
path: az_0052/train-*
- config_name: az_0053
data_files:
- split: train
path: az_0053/train-*
- config_name: az_0054
data_files:
- split: train
path: az_0054/train-*
- config_name: az_0055
data_files:
- split: train
path: az_0055/train-*
- config_name: az_0056
data_files:
- split: train
path: az_0056/train-*
- config_name: az_0057
data_files:
- split: train
path: az_0057/train-*
- config_name: az_0058
data_files:
- split: train
path: az_0058/train-*
- config_name: az_0059
data_files:
- split: train
path: az_0059/train-*
- config_name: az_0060
data_files:
- split: train
path: az_0060/train-*
- config_name: az_0061
data_files:
- split: train
path: az_0061/train-*
- config_name: az_0062
data_files:
- split: train
path: az_0062/train-*
- config_name: az_0063
data_files:
- split: train
path: az_0063/train-*
- config_name: az_0064
data_files:
- split: train
path: az_0064/train-*
- config_name: az_0065
data_files:
- split: train
path: az_0065/train-*
- config_name: az_0066
data_files:
- split: train
path: az_0066/train-*
- config_name: az_0067
data_files:
- split: train
path: az_0067/train-*
- config_name: az_0068
data_files:
- split: train
path: az_0068/train-*
- config_name: az_0069
data_files:
- split: train
path: az_0069/train-*
- config_name: az_0070
data_files:
- split: train
path: az_0070/train-*
- config_name: az_0071
data_files:
- split: train
path: az_0071/train-*
- config_name: az_0072
data_files:
- split: train
path: az_0072/train-*
- config_name: az_0073
data_files:
- split: train
path: az_0073/train-*
- config_name: az_0074
data_files:
- split: train
path: az_0074/train-*
- config_name: az_0075
data_files:
- split: train
path: az_0075/train-*
- config_name: az_0076
data_files:
- split: train
path: az_0076/train-*
- config_name: az_0077
data_files:
- split: train
path: az_0077/train-*
- config_name: az_0078
data_files:
- split: train
path: az_0078/train-*
- config_name: az_0079
data_files:
- split: train
path: az_0079/train-*
- config_name: az_0080
data_files:
- split: train
path: az_0080/train-*
- config_name: az_0081
data_files:
- split: train
path: az_0081/train-*
- config_name: az_0082
data_files:
- split: train
path: az_0082/train-*
- config_name: az_0083
data_files:
- split: train
path: az_0083/train-*
- config_name: az_0084
data_files:
- split: train
path: az_0084/train-*
- config_name: az_0085
data_files:
- split: train
path: az_0085/train-*
- config_name: az_0086
data_files:
- split: train
path: az_0086/train-*
- config_name: az_0087
data_files:
- split: train
path: az_0087/train-*
- config_name: az_0088
data_files:
- split: train
path: az_0088/train-*
- config_name: az_0089
data_files:
- split: train
path: az_0089/train-*
- config_name: az_0090
data_files:
- split: train
path: az_0090/train-*
- config_name: az_0091
data_files:
- split: train
path: az_0091/train-*
- config_name: az_0092
data_files:
- split: train
path: az_0092/train-*
- config_name: az_0093
data_files:
- split: train
path: az_0093/train-*
- config_name: az_0094
data_files:
- split: train
path: az_0094/train-*
- config_name: az_0095
data_files:
- split: train
path: az_0095/train-*
- config_name: az_0096
data_files:
- split: train
path: az_0096/train-*
- config_name: az_0097
data_files:
- split: train
path: az_0097/train-*
- config_name: az_0098
data_files:
- split: train
path: az_0098/train-*
- config_name: az_0099
data_files:
- split: train
path: az_0099/train-*
- config_name: az_0100
data_files:
- split: train
path: az_0100/train-*
- config_name: az_0101
data_files:
- split: train
path: az_0101/train-*
- config_name: az_0102
data_files:
- split: train
path: az_0102/train-*
- config_name: az_0103
data_files:
- split: train
path: az_0103/train-*
- config_name: az_0104
data_files:
- split: train
path: az_0104/train-*
- config_name: az_0105
data_files:
- split: train
path: az_0105/train-*
- config_name: az_0106
data_files:
- split: train
path: az_0106/train-*
- config_name: az_0107
data_files:
- split: train
path: az_0107/train-*
- config_name: az_0108
data_files:
- split: train
path: az_0108/train-*
- config_name: az_0109
data_files:
- split: train
path: az_0109/train-*
- config_name: az_0110
data_files:
- split: train
path: az_0110/train-*
- config_name: az_0111
data_files:
- split: train
path: az_0111/train-*
- config_name: az_0112
data_files:
- split: train
path: az_0112/train-*
- config_name: az_0113
data_files:
- split: train
path: az_0113/train-*
- config_name: az_0114
data_files:
- split: train
path: az_0114/train-*
- config_name: az_0115
data_files:
- split: train
path: az_0115/train-*
- config_name: az_0116
data_files:
- split: train
path: az_0116/train-*
- config_name: az_0117
data_files:
- split: train
path: az_0117/train-*
- config_name: az_0118
data_files:
- split: train
path: az_0118/train-*
- config_name: az_0119
data_files:
- split: train
path: az_0119/train-*
- config_name: az_0120
data_files:
- split: train
path: az_0120/train-*
- config_name: az_0121
data_files:
- split: train
path: az_0121/train-*
- config_name: az_0122
data_files:
- split: train
path: az_0122/train-*
- config_name: az_0123
data_files:
- split: train
path: az_0123/train-*
- config_name: az_0124
data_files:
- split: train
path: az_0124/train-*
- config_name: az_0125
data_files:
- split: train
path: az_0125/train-*
- config_name: az_0126
data_files:
- split: train
path: az_0126/train-*
- config_name: az_0127
data_files:
- split: train
path: az_0127/train-*
- config_name: az_0128
data_files:
- split: train
path: az_0128/train-*
- config_name: az_0129
data_files:
- split: train
path: az_0129/train-*
- config_name: az_0130
data_files:
- split: train
path: az_0130/train-*
- config_name: az_0131
data_files:
- split: train
path: az_0131/train-*
- config_name: az_0132
data_files:
- split: train
path: az_0132/train-*
- config_name: az_0133
data_files:
- split: train
path: az_0133/train-*
- config_name: az_0134
data_files:
- split: train
path: az_0134/train-*
- config_name: az_0135
data_files:
- split: train
path: az_0135/train-*
- config_name: az_0136
data_files:
- split: train
path: az_0136/train-*
- config_name: az_0137
data_files:
- split: train
path: az_0137/train-*
- config_name: az_0138
data_files:
- split: train
path: az_0138/train-*
- config_name: az_0139
data_files:
- split: train
path: az_0139/train-*
- config_name: az_0140
data_files:
- split: train
path: az_0140/train-*
- config_name: az_0141
data_files:
- split: train
path: az_0141/train-*
- config_name: az_0142
data_files:
- split: train
path: az_0142/train-*
- config_name: az_0143
data_files:
- split: train
path: az_0143/train-*
- config_name: az_0144
data_files:
- split: train
path: az_0144/train-*
- config_name: az_0145
data_files:
- split: train
path: az_0145/train-*
- config_name: az_0146
data_files:
- split: train
path: az_0146/train-*
- config_name: az_0147
data_files:
- split: train
path: az_0147/train-*
- config_name: az_0148
data_files:
- split: train
path: az_0148/train-*
- config_name: az_0149
data_files:
- split: train
path: az_0149/train-*
- config_name: az_0150
data_files:
- split: train
path: az_0150/train-*
- config_name: az_0151
data_files:
- split: train
path: az_0151/train-*
- config_name: az_0152
data_files:
- split: train
path: az_0152/train-*
- config_name: az_0153
data_files:
- split: train
path: az_0153/train-*
- config_name: az_0154
data_files:
- split: train
path: az_0154/train-*
- config_name: az_0155
data_files:
- split: train
path: az_0155/train-*
- config_name: az_0156
data_files:
- split: train
path: az_0156/train-*
- config_name: az_0157
data_files:
- split: train
path: az_0157/train-*
- config_name: az_0158
data_files:
- split: train
path: az_0158/train-*
- config_name: az_0159
data_files:
- split: train
path: az_0159/train-*
- config_name: az_0160
data_files:
- split: train
path: az_0160/train-*
- config_name: az_0161
data_files:
- split: train
path: az_0161/train-*
- config_name: az_0162
data_files:
- split: train
path: az_0162/train-*
- config_name: az_0163
data_files:
- split: train
path: az_0163/train-*
- config_name: az_0164
data_files:
- split: train
path: az_0164/train-*
- config_name: az_0165
data_files:
- split: train
path: az_0165/train-*
- config_name: az_0166
data_files:
- split: train
path: az_0166/train-*
- config_name: az_0167
data_files:
- split: train
path: az_0167/train-*
- config_name: az_0168
data_files:
- split: train
path: az_0168/train-*
- config_name: az_0169
data_files:
- split: train
path: az_0169/train-*
- config_name: az_0170
data_files:
- split: train
path: az_0170/train-*
- config_name: az_0171
data_files:
- split: train
path: az_0171/train-*
- config_name: az_0172
data_files:
- split: train
path: az_0172/train-*
- config_name: az_0173
data_files:
- split: train
path: az_0173/train-*
- config_name: az_0174
data_files:
- split: train
path: az_0174/train-*
- config_name: az_0175
data_files:
- split: train
path: az_0175/train-*
- config_name: az_0176
data_files:
- split: train
path: az_0176/train-*
- config_name: az_0177
data_files:
- split: train
path: az_0177/train-*
- config_name: az_0178
data_files:
- split: train
path: az_0178/train-*
- config_name: az_0179
data_files:
- split: train
path: az_0179/train-*
- config_name: az_0180
data_files:
- split: train
path: az_0180/train-*
- config_name: az_0181
data_files:
- split: train
path: az_0181/train-*
- config_name: az_0182
data_files:
- split: train
path: az_0182/train-*
- config_name: az_0183
data_files:
- split: train
path: az_0183/train-*
- config_name: az_0184
data_files:
- split: train
path: az_0184/train-*
- config_name: az_0185
data_files:
- split: train
path: az_0185/train-*
- config_name: az_0186
data_files:
- split: train
path: az_0186/train-*
- config_name: az_0187
data_files:
- split: train
path: az_0187/train-*
- config_name: az_0188
data_files:
- split: train
path: az_0188/train-*
- config_name: az_0189
data_files:
- split: train
path: az_0189/train-*
- config_name: az_0190
data_files:
- split: train
path: az_0190/train-*
- config_name: az_0191
data_files:
- split: train
path: az_0191/train-*
- config_name: az_0192
data_files:
- split: train
path: az_0192/train-*
- config_name: az_0193
data_files:
- split: train
path: az_0193/train-*
- config_name: az_0194
data_files:
- split: train
path: az_0194/train-*
- config_name: az_0195
data_files:
- split: train
path: az_0195/train-*
- config_name: az_0196
data_files:
- split: train
path: az_0196/train-*
- config_name: az_0197
data_files:
- split: train
path: az_0197/train-*
- config_name: az_0198
data_files:
- split: train
path: az_0198/train-*
- config_name: az_0199
data_files:
- split: train
path: az_0199/train-*
- config_name: az_0200
data_files:
- split: train
path: az_0200/train-*
- config_name: az_0201
data_files:
- split: train
path: az_0201/train-*
- config_name: az_0202
data_files:
- split: train
path: az_0202/train-*
- config_name: az_0203
data_files:
- split: train
path: az_0203/train-*
- config_name: az_0204
data_files:
- split: train
path: az_0204/train-*
- config_name: az_0205
data_files:
- split: train
path: az_0205/train-*
- config_name: az_0206
data_files:
- split: train
path: az_0206/train-*
- config_name: az_0207
data_files:
- split: train
path: az_0207/train-*
- config_name: az_0208
data_files:
- split: train
path: az_0208/train-*
- config_name: az_0209
data_files:
- split: train
path: az_0209/train-*
- config_name: az_0210
data_files:
- split: train
path: az_0210/train-*
- config_name: az_0211
data_files:
- split: train
path: az_0211/train-*
- config_name: az_0212
data_files:
- split: train
path: az_0212/train-*
- config_name: az_0213
data_files:
- split: train
path: az_0213/train-*
- config_name: az_0214
data_files:
- split: train
path: az_0214/train-*
- config_name: az_0215
data_files:
- split: train
path: az_0215/train-*
- config_name: az_0216
data_files:
- split: train
path: az_0216/train-*
- config_name: az_0217
data_files:
- split: train
path: az_0217/train-*
- config_name: az_0218
data_files:
- split: train
path: az_0218/train-*
- config_name: az_0219
data_files:
- split: train
path: az_0219/train-*
- config_name: az_0220
data_files:
- split: train
path: az_0220/train-*
- config_name: az_0221
data_files:
- split: train
path: az_0221/train-*
- config_name: az_0222
data_files:
- split: train
path: az_0222/train-*
- config_name: az_0223
data_files:
- split: train
path: az_0223/train-*
- config_name: az_0224
data_files:
- split: train
path: az_0224/train-*
- config_name: az_0225
data_files:
- split: train
path: az_0225/train-*
- config_name: az_0226
data_files:
- split: train
path: az_0226/train-*
- config_name: az_0227
data_files:
- split: train
path: az_0227/train-*
- config_name: az_0228
data_files:
- split: train
path: az_0228/train-*
- config_name: az_0229
data_files:
- split: train
path: az_0229/train-*
- config_name: az_0230
data_files:
- split: train
path: az_0230/train-*
- config_name: az_0231
data_files:
- split: train
path: az_0231/train-*
- config_name: az_0232
data_files:
- split: train
path: az_0232/train-*
- config_name: az_0233
data_files:
- split: train
path: az_0233/train-*
- config_name: az_0234
data_files:
- split: train
path: az_0234/train-*
- config_name: az_0235
data_files:
- split: train
path: az_0235/train-*
- config_name: az_0236
data_files:
- split: train
path: az_0236/train-*
- config_name: az_0237
data_files:
- split: train
path: az_0237/train-*
- config_name: az_0238
data_files:
- split: train
path: az_0238/train-*
- config_name: az_0239
data_files:
- split: train
path: az_0239/train-*
- config_name: az_0240
data_files:
- split: train
path: az_0240/train-*
- config_name: az_0241
data_files:
- split: train
path: az_0241/train-*
- config_name: az_0242
data_files:
- split: train
path: az_0242/train-*
- config_name: az_0243
data_files:
- split: train
path: az_0243/train-*
- config_name: az_0244
data_files:
- split: train
path: az_0244/train-*
- config_name: az_0245
data_files:
- split: train
path: az_0245/train-*
- config_name: az_0246
data_files:
- split: train
path: az_0246/train-*
- config_name: az_0247
data_files:
- split: train
path: az_0247/train-*
- config_name: az_0248
data_files:
- split: train
path: az_0248/train-*
- config_name: az_0249
data_files:
- split: train
path: az_0249/train-*
- config_name: az_0250
data_files:
- split: train
path: az_0250/train-*
- config_name: az_0251
data_files:
- split: train
path: az_0251/train-*
- config_name: az_0252
data_files:
- split: train
path: az_0252/train-*
- config_name: az_0253
data_files:
- split: train
path: az_0253/train-*
- config_name: az_0254
data_files:
- split: train
path: az_0254/train-*
- config_name: az_0255
data_files:
- split: train
path: az_0255/train-*
- config_name: az_0256
data_files:
- split: train
path: az_0256/train-*
---
提供机构:
HeshamHaroon



