RaiBP/openwebtext2-first-30-chunks-lang-detect-raw-output
收藏Hugging Face2024-02-11 更新2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/RaiBP/openwebtext2-first-30-chunks-lang-detect-raw-output
下载链接
链接失效反馈官方服务:
资源简介:
---
license: mit
---
# Counting bilingual and monolingual instances
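To count bilingual and monolingual instances, we use the following code. An instance is counted as bilingual for a given language when exactly two languages are detected in it, one of them English and the other German, French, Spanish, Italian, Portuguese or Dutch. Every other instance with two or more languages falls into the "other" category of the bilingual counts. Monolingual instances are counted separately for English and each of the six languages, with all remaining languages grouped under their own "other" category.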
```python
from datasets import load_dataset
import json
from tqdm import tqdm
# Specify the dataset name
dataset_name = "RaiBP/openwebtext2-first-30-chunks-lang-detect-raw-output"

# Languages that get their own bucket next to English; any other language is
# reported under "other".
TARGET_LANGUAGES = ("de", "fr", "es", "pt", "it", "nl")


def count_language_sets(documents):
    """Count how often each distinct set of languages occurs across instances.

    Args:
        documents: iterable of mappings that each have an
            "instance_languages" entry: one sequence of language codes per
            instance of the document.

    Returns:
        dict mapping a sorted tuple of unique language codes to the number of
        instances in which exactly that set of languages was found.
    """
    set_counts = {}
    for document in documents:
        for languages in document["instance_languages"]:
            # A tuple key (instead of a "-"-joined string that is split again
            # later) keeps hyphenated codes such as "zh-cn" in one piece.
            lang_set = tuple(sorted(set(languages)))
            set_counts[lang_set] = set_counts.get(lang_set, 0) + 1
    return set_counts


def summarize_counts(set_counts, target_languages=TARGET_LANGUAGES):
    """Fold per-language-set counts into the monolingual and bilingual reports.

    Args:
        set_counts: mapping produced by ``count_language_sets``.
        target_languages: language codes that get their own bucket.

    Returns:
        A ``(monolingual, bilingual)`` pair of dicts.
        ``monolingual`` has the keys "en", each target language and "other".
        ``bilingual`` has one key per target language (instances made of
        English plus exactly that language) and "other", which collects every
        remaining instance with two or more languages.
    """
    monolingual = {"en": 0}
    for lang in target_languages:
        monolingual[lang] = 0
    monolingual["other"] = 0

    bilingual = {lang: 0 for lang in target_languages}
    bilingual["other"] = 0

    for lang_set, count in set_counts.items():
        if len(lang_set) <= 1:
            # An instance with no language at all is kept under "other" here,
            # which is where the original string-key version placed it.
            lang = lang_set[0] if lang_set else None
            if lang == "en" or lang in target_languages:
                monolingual[lang] += count
            else:
                monolingual["other"] += count
        elif len(lang_set) == 2 and "en" in lang_set:
            partner = lang_set[0] if lang_set[1] == "en" else lang_set[1]
            if partner in target_languages:
                bilingual[partner] += count
            else:
                bilingual["other"] += count
        else:
            # Two languages without English, or more than two languages.
            bilingual["other"] += count

    return monolingual, bilingual


def main():
    """Load the bilingual split, count the instances and write both JSON files."""
    # Load the dataset
    bilingual_dataset = load_dataset(dataset_name, data_dir="bilingual")
    dataset = bilingual_dataset["train"]
    n_examples = len(dataset)

    set_counts = count_language_sets(tqdm(dataset, total=n_examples))
    counts_dict_monolingual, counts_dict_bilingual = summarize_counts(set_counts)

    # Save the counts for monolingual
    with open("monolingual_counts.json", "w") as json_file:
        json.dump(counts_dict_monolingual, json_file)

    # Save the counts for bilingual
    with open("bilingual_counts.json", "w") as json_file:
        json.dump(counts_dict_bilingual, json_file)


if __name__ == "__main__":
    main()
```
# Counting translation instances
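To count translation instances containing English paired with German, French, Spanish, Portuguese, Italian or Dutch, we use the code below. The rows of the dataset are grouped by document and instance index; each instance is then assigned to the language that appears most often in its English-paired labels, and instances with no such pair are counted as "others".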
```python
from datasets import load_dataset
import json
from tqdm import tqdm
# Specify the dataset name
dataset_name = "RaiBP/openwebtext2-first-30-chunks-lang-detect-raw-output"

# Languages that are counted individually when paired with English.
TARGET_LANGUAGES = ("de", "fr", "es", "pt", "it", "nl")


def group_label_pairs(rows):
    """Group the label pairs of all rows by the instance they belong to.

    Args:
        rows: iterable of mappings with the entries "document_index",
            "instance_index", "embedded_label" and "primary_label".

    Returns:
        dict mapping ``(document_index, instance_index)`` to the list of
        ``(embedded_label, primary_label)`` pairs seen for that instance, in
        row order.
    """
    instances = {}
    for row in rows:
        instance_key = (row["document_index"], row["instance_index"])
        # Keep the two labels as a pair instead of joining them with "-" and
        # splitting later, so a hyphenated code is never cut in half.
        label_pair = (row["embedded_label"], row["primary_label"])
        instances.setdefault(instance_key, []).append(label_pair)
    return instances


def count_translation_instances(instances, target_languages=TARGET_LANGUAGES):
    """Assign every instance to one target language or to "others".

    An instance is counted for the target language that occurs most often in
    its label pairs that also contain "en". On a tie, the language seen first
    wins, so repeated runs give the same result. Instances without any
    English/target-language pair are counted as "others".

    Args:
        instances: mapping produced by ``group_label_pairs``.
        target_languages: language codes that are counted individually.

    Returns:
        dict with one count per target language plus an "others" entry.
    """
    counts_dict = {lang: 0 for lang in target_languages}
    others_count = 0

    for label_pairs in instances.values():
        # Occurrences of each target language paired with English; dicts keep
        # insertion order, which gives the first-seen tie-break below.
        hits = {}
        for embedded_label, primary_label in label_pairs:
            if "en" not in (embedded_label, primary_label):
                continue
            non_english = embedded_label if primary_label == "en" else primary_label
            if non_english in target_languages:
                hits[non_english] = hits.get(non_english, 0) + 1

        if hits:
            majority_lang = max(hits, key=hits.get)
            counts_dict[majority_lang] += 1
        elif label_pairs:
            others_count += 1
        else:
            # Only reachable for an instance that has no label pairs at all.
            print("Error: state is 0")

    counts_dict["others"] = others_count
    return counts_dict


def main():
    """Load the translation split, count the instances and write the JSON file."""
    # Load the dataset
    translation_dataset = load_dataset(dataset_name, data_dir="translation")
    dataset = translation_dataset["train"]
    n_examples = len(dataset)

    instances = group_label_pairs(tqdm(dataset, total=n_examples))
    counts_dict = count_translation_instances(instances)

    # Specify the file path where you want to save the JSON file
    file_path = "translation_counts.json"

    # Save the dictionary as a JSON file
    with open(file_path, "w") as json_file:
        json.dump(
            counts_dict, json_file, indent=2
        )  # indent argument is optional, but it makes the file more human-readable


if __name__ == "__main__":
    main()
```
提供机构:
RaiBP
原始信息汇总
数据集概述
数据集名称
- 名称: RaiBP/openwebtext2-first-30-chunks-lang-detect-raw-output
数据集处理
双语和单语实例计数
- 目的: 统计双语和单语实例,其中双语实例包含英语和另一种语言(德语、法语、西班牙语、意大利语、葡萄牙语或荷兰语),其他实例归入“其他”类别。
- 方法:
- 加载数据集并遍历每个文档。
- 统计不同语言组合的实例数量。
- 将统计结果保存为JSON文件。
翻译实例计数
- 目的: 统计包含英语与德语、法语、西班牙语、葡萄牙语、意大利语或荷兰语配对的翻译实例。
- 方法:
- 加载数据集并遍历每个文档。
- 统计包含英语与其他指定语言的翻译实例数量。
- 将统计结果保存为JSON文件。



