Tonic/EasyReddit

Name: Tonic/EasyReddit
Creator: Tonic
Published: 2023-11-13 12:52:23
License: MIT

Hugging Face · 2023-11-13 更新 · 2024-03-04 收录

下载链接：

https://hf-mirror.com/datasets/Tonic/EasyReddit

下载链接

链接失效反馈

官方服务：

资源简介：

---
license: mit
language:
- en
tags:
- not-for-all-audiences
- chemistry
- biology
- finance
- legal
- music
- art
- code
- climate
- medical
pretty_name: Easy Reddit
size_categories:
- 10M<n<100M
configs:
- config_name: shards
  data_files:
  - split: train
    path:
    - shard_1.jsonl
    - shard_2.jsonl
    - shard_3.jsonl
    - shard_4.jsonl
    - shard_5.jsonl
    - shard_6.jsonl
    - shard_7.jsonl
    - shard_8.jsonl
    - shard_9.jsonl
    - shard_10.jsonl
    - shard_11.jsonl
    - shard_12.jsonl
    - shard_13.jsonl
    - shard_14.jsonl
    - shard_15.jsonl
    - shard_16.jsonl
    - shard_17.jsonl
    - shard_18.jsonl
    - shard_19.jsonl
    - shard_20.jsonl
    - shard_21.jsonl
    - shard_22.jsonl
    - shard_23.jsonl
    - shard_24.jsonl
    - shard_25.jsonl
    - shard_26.jsonl
    - shard_27.jsonl
    - shard_28.jsonl
    - shard_29.jsonl
    - shard_30.jsonl
    - shard_31.jsonl
    - shard_32.jsonl
    - shard_33.jsonl
    - shard_34.jsonl
---

# 🙋🏻‍♂️Welcome to 🧑🏻‍🚀Tonic's🚀🚰Easy🔴Reddit🔥!

![image/png](https://cdn-uploads.huggingface.co/production/uploads/62a3bb1cd0d8c2c2169f0b88/tsm1OFhNgT4wzIw-_MGQ2.png)

This is every "best reddit_question_best_answers" appended and produced according to the following template :

```json
{"prompt": "This is the first prompt", "completion": "This is the first completion"}
{"prompt": "This is the second prompt", "completion": "This is the second completion"}
```

![image/png](https://cdn-uploads.huggingface.co/production/uploads/62a3bb1cd0d8c2c2169f0b88/N_RqZSJ32MDIrRGbLcPqm.png)

- 🌟 You can use it in shards or all together !
- 🌟 This dataset is **internally consistent** !

🤔The point is to make it easy to train models with a single correctly formatted dataset of

- **54,367,153 rows**

# Original Dataset :

[nreimers/reddit_question_best_answers](https://huggingface.co/datasets/nreimers/reddit_question_best_answers)

# How To Use :

Combine random shards in random quantities to produce a very high quality conversational training dataset for fine tuning or try combining rows line by line to save memory by running the following code:

```python
# see selectbyline.py

import os
import random

# Directory containing the shard JSONL files
shard_directory = "/path/to/shard/directory"

# Get a list of all JSONL files in the directory
shard_files = [f for f in os.listdir(shard_directory) if f.endswith('.jsonl')]

# Function to read a random number of lines (between min_lines and max_lines) from a file
def read_random_lines(filename, min_lines, max_lines):
    selected_lines = []
    num_lines = random.randint(min_lines, max_lines)

    with open(filename, 'r') as file:
        lines = list(file)
        if len(lines) <= num_lines:
            return lines
        selected_lines = random.sample(lines, num_lines)

    return selected_lines

# Function to combine shards
def combine_shards(output_filename, num_combinations):
    with open(output_filename, 'w') as output_file:
        for _ in range(num_combinations):
            selected_shard_file = random.choice(shard_files)
            lines = read_random_lines(os.path.join(shard_directory, selected_shard_file), 5000, 10000)
            output_file.writelines(lines)

# Example usage
combine_shards("/path/to/output/combined_shards.jsonl", 10)
```

# Pre-Processing

```python
import json
import os
import gzip
import logging
import re
import random

# Setup basic logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def clean_string(s):
    """Remove special characters, keeping only alphanumeric characters and spaces."""
    if isinstance(s, list):
        # Extract text from each dictionary in the list and join into a single string
        s = " ".join([d.get("body", "") if isinstance(d, dict) else str(d) for d in s])
    return re.sub(r'[^A-Za-z0-9 ]+', '', s)

def process_file(input_file, output_file):
    try:
        dataset = []
        with gzip.open(input_file, 'rt') as infile:
            for line in infile:
                # Parse the JSON line
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    logging.error(f"Invalid JSON format in {input_file}: {line}")
                    continue

                # Extract and clean the 'body' and 'answers' fields
                prompt = clean_string(data.get("body", ""))
                completion = clean_string(data.get("answers", ""))

                # For each body found, make a new row and duplicate the prompt for it
                if isinstance(data.get("body", ""), list):
                    for body in data.get("body", []):
                        cleaned_body = clean_string(body)
                        dataset.append({"prompt": cleaned_body, "completion": completion})
                else:
                    dataset.append({"prompt": prompt, "completion": completion})

        # Shuffle the dataset
        random.shuffle(dataset)

        # Write the shuffled dataset to the output file
        with open(output_file, 'a') as outfile:
            for item in dataset:
                json.dump(item, outfile)
                outfile.write('\n')

        logging.info(f"Processed file: {input_file}")

    except Exception as e:
        logging.error(f"Error processing file {input_file}: {e}")

def process_files(file_list, output_dir):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Create a single output file path
    output_file = os.path.join(output_dir, 'synthesized_dataset.jsonl')

    for input_file in file_list:
        process_file(input_file, output_file)

# Update with your list of .gz file paths
file_list = [r'C:\Users\MeMyself\FILES', r"C:\Users\MeMyself\FILES" ] # Update with your list of .gz file paths
output_dir = r'C:\Users\MeMyself\reddit_question_best_answers\processed'
process_files(file_list, output_dir)
```

#### **sharding script** :

```python
import json
import os

def read_dataset(file_path):
    try:
        with open(file_path, 'r') as file:
            data = [json.loads(line) for line in file]
        print(f"Dataset loaded successfully from {file_path}.")
        return data
    except Exception as e:
        print(f"Error reading dataset from {file_path}: {e}")
        return []

def shard_dataset(dataset, num_shards):
    shard_size = len(dataset) // num_shards
    shards = [dataset[i:i + shard_size] for i in range(0, len(dataset), shard_size)]
    if len(shards) > num_shards:
        shards[num_shards - 1].extend(shards.pop())
    print(f"Dataset sharded into {num_shards} parts.")
    return shards

def write_shards(shards, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory at {output_dir}.")

    for i, shard in enumerate(shards):
        shard_file = os.path.join(output_dir, f'shard_{i+1}.jsonl')
        with open(shard_file, 'w') as file:
            for item in shard:
                json.dump(item, file)
                file.write('\n')
        print(f"Shard {i+1} written to {shard_file}.")

def main():
    input_file = 'path_to_processed_dataset.jsonl'  # Update with your processed dataset file path
    output_dir = 'sharded_dataset'  # Update with your output directory for shards
    num_shards = 33

    dataset = read_dataset(input_file)
    if dataset:
        shards = shard_dataset(dataset, num_shards)
        write_shards(shards, output_dir)
        print("All shards have been successfully written.")
    else:
        print("No dataset to process.")

if __name__ == "__main__":
    main()
```

### Disclaimer :

🌟Re-format this dataset before use.

🌟Probably there's a **big problem with the token count** on these long answers 😉

🌟**Good Luck !** 🧑🏻‍🚀🚀

提供机构：

Tonic

原始信息汇总

数据集概述

基本信息

许可证：MIT
语言：英语
标签：
- not-for-all-audiences
- chemistry
- biology
- finance
- legal
- music
- art
- code
- climate
- medical
数据集名称：Easy Reddit
数据集大小：10M<n<100M

数据配置

配置名称：shards
数据文件：
- 分割：train
- 路径：
  - shard_1.jsonl
  - shard_2.jsonl
  - shard_3.jsonl
  - shard_4.jsonl
  - shard_5.jsonl
  - shard_6.jsonl
  - shard_7.jsonl
  - shard_8.jsonl
  - shard_9.jsonl
  - shard_10.jsonl
  - shard_11.jsonl
  - shard_12.jsonl
  - shard_13.jsonl
  - shard_14.jsonl
  - shard_15.jsonl
  - shard_16.jsonl
  - shard_17.jsonl
  - shard_18.jsonl
  - shard_19.jsonl
  - shard_20.jsonl
  - shard_21.jsonl
  - shard_22.jsonl
  - shard_23.jsonl
  - shard_24.jsonl
  - shard_25.jsonl
  - shard_26.jsonl
  - shard_27.jsonl
  - shard_28.jsonl
  - shard_29.jsonl
  - shard_30.jsonl
  - shard_31.jsonl
  - shard_32.jsonl
  - shard_33.jsonl
  - shard_34.jsonl

数据集描述

数据格式（JSON Lines，每行一条记录）：
{"prompt": "This is the first prompt", "completion": "This is the first completion"}
{"prompt": "This is the second prompt", "completion": "This is the second completion"}
数据集特性：
- 可分片使用或整体使用。
- 数据集内部一致。
数据集规模：54,367,153行

使用方法

组合分片（Python）：

import os
import random

# Directory containing the shard JSONL files
shard_directory = "/path/to/shard/directory"

# Get a list of all JSONL files in the directory (bare file names, not full
# paths; they are joined with shard_directory when a shard is opened)
shard_files = [f for f in os.listdir(shard_directory) if f.endswith('.jsonl')]

# Function to read a random number of lines (between min_lines and max_lines) from a file
def read_random_lines(filename, min_lines, max_lines):
    """Return a random sample of lines from a text file.

    A target count is drawn uniformly from ``[min_lines, max_lines]``. If the
    file has no more lines than that target, every line is returned in file
    order; otherwise that many distinct lines are sampled without replacement.

    Args:
        filename: Path of the text (JSONL) file to sample from.
        min_lines: Smallest number of lines to request.
        max_lines: Largest number of lines to request.

    Returns:
        A list of lines, each still carrying its trailing newline.

    Raises:
        ValueError: If ``min_lines`` is greater than ``max_lines``
            (raised by ``random.randint``).
        OSError: If the file cannot be opened.
    """
    num_lines = random.randint(min_lines, max_lines)

    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(filename, 'r', encoding='utf-8') as file:
        # The whole file is loaded because random.sample needs a sequence.
        lines = list(file)

    if len(lines) <= num_lines:
        return lines
    return random.sample(lines, num_lines)
# Function to combine shards
def combine_shards(output_filename, num_combinations):
    """Build one JSONL file from random samples of randomly chosen shards.

    For each of ``num_combinations`` rounds a shard is picked at random from
    the module-level ``shard_files`` list, between 5000 and 10000 lines are
    sampled from it, and those lines are appended to the output.

    The same shard can be picked in more than one round, so the output may
    contain duplicate lines.

    Args:
        output_filename: Path of the combined file; existing content is
            overwritten.
        num_combinations: Number of sampling rounds to run.
    """
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        for _ in range(num_combinations):
            selected_shard_file = random.choice(shard_files)
            shard_path = os.path.join(shard_directory, selected_shard_file)
            lines = read_random_lines(shard_path, 5000, 10000)
            output_file.writelines(lines)

Example usage

combine_shards("/path/to/output/combined_shards.jsonl", 10)

预处理

预处理脚本（Python）：

import json
import os
import gzip
import logging
import re
import random

Setup basic logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def clean_string(s):
    """Remove special characters, keeping only ASCII alphanumerics and spaces.

    Args:
        s: A string, or a list whose items are dictionaries (their ``"body"``
            value is used) or arbitrary values (converted with ``str``).
            ``None`` is treated as an empty string.

    Returns:
        The cleaned text. List items are joined with single spaces before
        cleaning.
    """
    if s is None:
        return ""
    if isinstance(s, list):
        # Extract text from each dictionary in the list and join into a single
        # string; str() guards against a non-string "body" value.
        s = " ".join(
            str(d.get("body", "")) if isinstance(d, dict) else str(d)
            for d in s
        )
    elif not isinstance(s, str):
        # Numbers and other scalars would make re.sub raise TypeError.
        s = str(s)
    return re.sub(r'[^A-Za-z0-9 ]+', '', s)

def process_file(input_file, output_file):
    """Convert one gzipped JSONL dump into shuffled prompt/completion rows.

    Each input line is parsed as JSON. Its ``"body"`` field becomes the prompt
    and its ``"answers"`` field the completion, both passed through
    ``clean_string``. When ``"body"`` is a list, one row is emitted per list
    item, all sharing the same completion. The rows of this file are shuffled
    and appended to ``output_file`` as JSON Lines.

    Lines that are not valid JSON, or that do not decode to a JSON object, are
    logged and skipped. Any other failure is logged and nothing from this file
    is written.

    Args:
        input_file: Path of the gzip-compressed JSONL input.
        output_file: Path of the JSONL file to append to.
    """
    try:
        dataset = []
        with gzip.open(input_file, 'rt', encoding='utf-8') as infile:
            for line in infile:
                # Parse the JSON line
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    logging.error(f"Invalid JSON format in {input_file}: {line}")
                    continue

                # A valid JSON value that is not an object has no fields to
                # read; skip it instead of aborting the whole file.
                if not isinstance(data, dict):
                    logging.error(f"Skipping non-object JSON record in {input_file}: {line}")
                    continue

                # Extract and clean the 'body' and 'answers' fields
                body = data.get("body", "")
                completion = clean_string(data.get("answers", ""))

                # For each body found, make a new row and duplicate the
                # completion for it
                if isinstance(body, list):
                    dataset.extend(
                        {"prompt": clean_string(item), "completion": completion}
                        for item in body
                    )
                else:
                    dataset.append({"prompt": clean_string(body), "completion": completion})

        # Shuffle the dataset
        random.shuffle(dataset)

        # Write the shuffled dataset to the output file (append mode, so
        # several input files accumulate in one output)
        with open(output_file, 'a', encoding='utf-8') as outfile:
            for item in dataset:
                json.dump(item, outfile)
                outfile.write('\n')

        logging.info(f"Processed file: {input_file}")

    except Exception as e:
        # Best-effort batch job: report the failure and let the caller move on
        # to the next input file.
        logging.error(f"Error processing file {input_file}: {e}")

def process_files(file_list, output_dir):
    """Process every input file into a single JSONL file in ``output_dir``.

    Args:
        file_list: Iterable of gzipped JSONL input paths.
        output_dir: Directory that receives ``synthesized_dataset.jsonl``;
            created (including parents) if it does not exist.
    """
    # Ensure the output directory exists; exist_ok avoids the race between
    # checking for the directory and creating it.
    os.makedirs(output_dir, exist_ok=True)

    # Create a single output file path
    output_file = os.path.join(output_dir, 'synthesized_dataset.jsonl')

    for input_file in file_list:
        process_file(input_file, output_file)

# Update with your list of .gz file paths
file_list = [
    r'C:\Users\MeMyself\FILES',
    r"C:\Users\MeMyself\FILES",
]
# Raw strings keep the backslashes literal; without the r prefix, "\r" in
# "\reddit_question_best_answers" would be read as a carriage return.
output_dir = r'C:\Users\MeMyself\reddit_question_best_answers\processed'
process_files(file_list, output_dir)

分片脚本

分片脚本（Python）：

import json
import os

def read_dataset(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Blank lines are ignored. Any error (missing file, invalid JSON, ...) is
    printed and an empty list is returned, so callers can treat "nothing
    loaded" and "load failed" the same way.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with one parsed object per non-blank line, or ``[]`` on error.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # letting json.loads fail and discard the whole dataset.
            data = [json.loads(line) for line in file if line.strip()]
        print(f"Dataset loaded successfully from {file_path}.")
        return data
    except Exception as e:
        print(f"Error reading dataset from {file_path}: {e}")
        return []

def shard_dataset(dataset, num_shards):
    """Split ``dataset`` into exactly ``num_shards`` contiguous shards.

    Shard sizes differ by at most one item: the first
    ``len(dataset) % num_shards`` shards each receive one extra item. Joining
    the shards in order reproduces ``dataset``. If there are fewer items than
    shards, the trailing shards are empty.

    Args:
        dataset: Sequence of records to split (it is sliced, not modified).
        num_shards: Number of shards to produce; must be at least 1.

    Returns:
        A list of ``num_shards`` lists.

    Raises:
        ValueError: If ``num_shards`` is less than 1.
    """
    if num_shards < 1:
        raise ValueError(f"num_shards must be at least 1, got {num_shards}")

    # Computing the boundaries directly avoids a zero slice step when the
    # dataset is smaller than num_shards, and always yields the exact count.
    base_size, extra = divmod(len(dataset), num_shards)
    shards = []
    start = 0
    for index in range(num_shards):
        stop = start + base_size + (1 if index < extra else 0)
        shards.append(dataset[start:stop])
        start = stop

    print(f"Dataset sharded into {num_shards} parts.")
    return shards

def write_shards(shards, output_dir):
    """Write each shard to ``shard_<n>.jsonl`` (numbered from 1) in ``output_dir``.

    Args:
        shards: Iterable of shards, each an iterable of JSON-serialisable
            records.
        output_dir: Target directory; created (including parents) when
            missing. Existing shard files with the same names are overwritten.
    """
    if not os.path.exists(output_dir):
        # exist_ok covers the case where the directory appears between the
        # check and the call.
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created output directory at {output_dir}.")

    for number, shard in enumerate(shards, start=1):
        shard_file = os.path.join(output_dir, f'shard_{number}.jsonl')
        with open(shard_file, 'w', encoding='utf-8') as file:
            for item in shard:
                json.dump(item, file)
                file.write('\n')
        print(f"Shard {number} written to {shard_file}.")

def main():
    """Read the processed dataset, split it into shards and write them out.

    The input path, output directory and shard count are placeholders to be
    edited before running. Nothing is written when the dataset is empty or
    could not be read.
    """
    input_file = 'path_to_processed_dataset.jsonl'  # Update with your processed dataset file path
    output_dir = 'sharded_dataset'  # Update with your output directory for shards
    num_shards = 33

    dataset = read_dataset(input_file)
    if dataset:
        shards = shard_dataset(dataset, num_shards)
        write_shards(shards, output_dir)
        print("All shards have been successfully written.")
    else:
        print("No dataset to process.")


if __name__ == "__main__":
    main()

免责声明

使用前请重新格式化数据集。
可能存在长答案的token计数问题。
祝好运！

5,000+

优质数据集

54 个

任务类型

进入经典数据集