RaiBP/openwebtext2-first-30-chunks-lang-detect-raw-output

Name: RaiBP/openwebtext2-first-30-chunks-lang-detect-raw-output
Creator: RaiBP
Published: 2024-02-11 13:29:06
License: MIT

Hugging Face · 2024-02-11 更新 · 2024-03-04 收录

下载链接：

https://hf-mirror.com/datasets/RaiBP/openwebtext2-first-30-chunks-lang-detect-raw-output

下载链接

链接失效反馈

官方服务：

资源简介：

---
license: mit
---

# Counting bilingual and monolingual instances

In order to count bilingual and monolingual instances, we use the following code. We count bilingual instances where there are two languages, one of them is English and the other is either German, French, Spanish, Italian, Portuguese or Dutch. All other instances fall into the "Other" category.

```python
import json

from datasets import load_dataset
from tqdm import tqdm

# Specify the dataset name
dataset_name = "RaiBP/openwebtext2-first-30-chunks-lang-detect-raw-output"

# Load the dataset
bilingual_dataset = load_dataset(dataset_name, data_dir='bilingual')

dataset = bilingual_dataset["train"]
n_examples = len(dataset)

# Map every distinct language combination (sorted, de-duplicated codes joined
# by "-") to the number of instances in which that combination occurs.
keys_dict = {}
for document in tqdm(dataset, total=n_examples):
    instance_languages = document["instance_languages"]

    for languages in instance_languages:
        lang_key = "-".join(sorted(set(languages)))
        keys_dict[lang_key] = keys_dict.get(lang_key, 0) + 1

# NOTE(review): keys are split on "-" again below, so a language code that
# itself contains a hyphen would be read as two languages -- confirm that the
# detector never emits such codes.
english_keys_list = []  # keys where "en" is present
non_english_keys_list = []  # keys where "en" is not present
for key in keys_dict:
    key_list = key.split('-')
    if "en" in key_list:
        english_keys_list.append(key_list)
    else:
        non_english_keys_list.append(key_list)

# more than two languages, none of them English
nen_multi_count = 0
# one language, one of the following: de, fr, es, pt, it, nl
lang_mono_count = {'de': 0, 'fr': 0, 'es': 0, 'pt': 0, 'it': 0, 'nl': 0}
# one language, not one of the following: de, fr, es, pt, it, nl
other_mono_count = 0
# two languages, none of them English
nen_bi_count = 0
for key in non_english_keys_list:
    if len(key) > 2:
        nen_multi_count += keys_dict['-'.join(key)]
    elif len(key) == 2:
        nen_bi_count += keys_dict['-'.join(key)]
    elif len(key) == 1:
        nen_lang = key[0]
        if nen_lang in lang_mono_count:
            lang_mono_count[nen_lang] += keys_dict[nen_lang]
        else:
            other_mono_count += keys_dict[nen_lang]

# more than two languages, at least one of them English
english_multi_count = 0
# one language, English
english_mono_count = 0
for key in english_keys_list:
    if len(key) == 1 and key[0] == 'en':
        english_mono_count += keys_dict[key[0]]
    if len(key) > 2:
        english_multi_count += keys_dict['-'.join(key)]

# two languages, one of them English, the other one not one of the following: de, fr, es, pt, it, nl
other_bi_count = 0
# two languages, one of them English, the other one of the following: de, fr, es, pt, it, nl
lang_bi_count = {'de': 0, 'fr': 0, 'es': 0, 'pt': 0, 'it': 0, 'nl': 0}
for key in english_keys_list:
    if len(key) == 2:
        # Exactly one of the two entries is "en"; take the other one.
        nen_lang = key[0] if key[1] == 'en' else key[1]
        if nen_lang in lang_bi_count:
            lang_bi_count[nen_lang] += keys_dict['-'.join(key)]
        else:
            other_bi_count += keys_dict['-'.join(key)]

# Save the counts for monolingual
counts_dict_monolingual = {"en": english_mono_count}
counts_dict_monolingual.update(lang_mono_count)
counts_dict_monolingual["other"] = other_mono_count

with open('monolingual_counts.json', 'w') as json_file:
    json.dump(counts_dict_monolingual, json_file)

# Save the counts for bilingual
counts_dict_bilingual = dict(lang_bi_count)
# Every multi-language combination that is not English paired with one of the
# six target languages is folded into "other".
counts_dict_bilingual["other"] = other_bi_count + nen_bi_count + english_multi_count + nen_multi_count

with open('bilingual_counts.json', 'w') as json_file:
    json.dump(counts_dict_bilingual, json_file)
```

# Counting translation instances

In order to count translation instances containing English paired with German, French, Spanish, Portuguese, Italian or Dutch, we use:

```python
import json

from datasets import load_dataset
from tqdm import tqdm

# Specify the dataset name
dataset_name = "RaiBP/openwebtext2-first-30-chunks-lang-detect-raw-output"

# Load the dataset
translation_dataset = load_dataset(dataset_name, data_dir="translation")

dataset = translation_dataset["train"]
n_examples = len(dataset)
counts_dict = {"de": 0, "fr": 0, "es": 0, "pt": 0, "it": 0, "nl": 0}
others_count = 0

# Group the "<embedded_label>-<primary_label>" pair of every row under its
# "<document_index>-<instance_index>" key, so that each instance is counted
# once even when several rows refer to it.
instances = {}
for document in tqdm(dataset, total=n_examples):
    embedded_label = document["embedded_label"]
    primary_label = document["primary_label"]
    document_id = document["document_index"]
    instance_id = document["instance_index"]
    instance_key = f"{document_id}-{instance_id}"
    instances.setdefault(instance_key, []).append(f"{embedded_label}-{primary_label}")

# state: 0 = nothing seen yet, 1 = at least one pair of English with a language
# in counts_dict, 2 = only other pairs seen. Once state 1 is reached it is
# never downgraded.
# NOTE(review): pairs are split on "-", so a label that itself contains a
# hyphen would be mis-parsed -- confirm that no such labels occur.
for labels in instances.values():
    state = 0
    found_langs = []
    for langs in labels:
        lang_pair = langs.split("-")
        if "en" in lang_pair:
            non_english = lang_pair[0] if lang_pair[1] == "en" else lang_pair[1]
            if non_english in counts_dict:
                state = 1  # found a translation with English and a language in the counts_dict
                found_langs.append(non_english)
            elif state != 1:
                state = 2  # found a translation with English and a language not in the counts_dict
        elif state != 1:
            state = 2  # found a translation without English
    if state == 1:
        # Most frequent language wins; iterating the list (not a set) makes
        # ties resolve to the first language seen, so the result does not
        # depend on string hash randomization between runs.
        majority_lang = max(found_langs, key=found_langs.count)
        counts_dict[majority_lang] += 1
    elif state == 2:
        others_count += 1
    else:
        print("Error: state is 0")

# Specify the file path where you want to save the JSON file
file_path = "translation_counts.json"
counts_dict["others"] = others_count

# Save the dictionary as a JSON file
with open(file_path, "w") as json_file:
    json.dump(
        counts_dict, json_file, indent=2
    )  # indent argument is optional, but it makes the file more human-readable
```

提供机构：

RaiBP

原始信息汇总