five

Tonic/EasyReddit

收藏
Hugging Face2023-11-13 更新2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/Tonic/EasyReddit
下载链接
链接失效反馈
官方服务:
资源简介:
--- license: mit language: - en tags: - not-for-all-audiences - chemistry - biology - finance - legal - music - art - code - climate - medical pretty_name: Easy Reddit size_categories: - 10M<n<100M configs: - config_name: shards data_files: - split: train path: - shard_1.jsonl - shard_2.jsonl - shard_3.jsonl - shard_4.jsonl - shard_5.jsonl - shard_6.jsonl - shard_7.jsonl - shard_8.jsonl - shard_9.jsonl - shard_10.jsonl - shard_11.jsonl - shard_12.jsonl - shard_13.jsonl - shard_14.jsonl - shard_15.jsonl - shard_16.jsonl - shard_17.jsonl - shard_18.jsonl - shard_19.jsonl - shard_20.jsonl - shard_21.jsonl - shard_22.jsonl - shard_23.jsonl - shard_24.jsonl - shard_25.jsonl - shard_26.jsonl - shard_27.jsonl - shard_28.jsonl - shard_29.jsonl - shard_30.jsonl - shard_31.jsonl - shard_32.jsonl - shard_33.jsonl - shard_34.jsonl --- # 🙋🏻‍♂️Welcome to 🧑🏻‍🚀Tonic's🚀🚰Easy🔴Reddit🔥! ![image/png](https://cdn-uploads.huggingface.co/production/uploads/62a3bb1cd0d8c2c2169f0b88/tsm1OFhNgT4wzIw-_MGQ2.png) This is every "best reddit_question_best_answers" appended and produced according to the following template : ```json {"prompt": "This is the first prompt", "completion": "This is the first completion"} {"prompt": "This is the second prompt", "completion": "This is the second completion"} ``` ![image/png](https://cdn-uploads.huggingface.co/production/uploads/62a3bb1cd0d8c2c2169f0b88/N_RqZSJ32MDIrRGbLcPqm.png) - 🌟 You can use it in shards or all together ! - 🌟 This dataset is **internally consistent** ! 
🤔The point is to make it easy to train models with a single correctly formatted dataset of - **54,367,153 rows** # Original Dataset : [nreimers/reddit_question_best_answers](https://huggingface.co/datasets/nreimers/reddit_question_best_answers) # How To Use : Combine random shards in random quantities to produce a very high quality conversational training dataset for fine tuning or try combining rows line by line to save memory by running the following code: ```python # see selectbyline.py import os import random # Directory containing the shard JSONL files shard_directory = "/path/to/shard/directory" # Get a list of all JSONL files in the directory shard_files = [f for f in os.listdir(shard_directory) if f.endswith('.jsonl')] # Function to read a random number of lines (between min_lines and max_lines) from a file def read_random_lines(filename, min_lines, max_lines): selected_lines = [] num_lines = random.randint(min_lines, max_lines) with open(filename, 'r') as file: lines = list(file) if len(lines) <= num_lines: return lines selected_lines = random.sample(lines, num_lines) return selected_lines # Function to combine shards def combine_shards(output_filename, num_combinations): with open(output_filename, 'w') as output_file: for _ in range(num_combinations): selected_shard_file = random.choice(shard_files) lines = read_random_lines(os.path.join(shard_directory, selected_shard_file), 5000, 10000) output_file.writelines(lines) # Example usage combine_shards("/path/to/output/combined_shards.jsonl", 10) ``` # Pre-Processing ```python import json import os import gzip import logging import re import random # Setup basic logging logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") def clean_string(s): """Remove special characters, keeping only alphanumeric characters and spaces.""" if isinstance(s, list): # Extract text from each dictionary in the list and join into a single string s = " ".join([d.get("body", "") if isinstance(d, 
dict) else str(d) for d in s]) return re.sub(r'[^A-Za-z0-9 ]+', '', s) def process_file(input_file, output_file): try: dataset = [] with gzip.open(input_file, 'rt') as infile: for line in infile: # Parse the JSON line try: data = json.loads(line) except json.JSONDecodeError: logging.error(f"Invalid JSON format in {input_file}: {line}") continue # Extract and clean the 'body' and 'answers' fields prompt = clean_string(data.get("body", "")) completion = clean_string(data.get("answers", "")) # For each body found, make a new row and duplicate the prompt for it if isinstance(data.get("body", ""), list): for body in data.get("body", []): cleaned_body = clean_string(body) dataset.append({"prompt": cleaned_body, "completion": completion}) else: dataset.append({"prompt": prompt, "completion": completion}) # Shuffle the dataset random.shuffle(dataset) # Write the shuffled dataset to the output file with open(output_file, 'a') as outfile: for item in dataset: json.dump(item, outfile) outfile.write('\n') logging.info(f"Processed file: {input_file}") except Exception as e: logging.error(f"Error processing file {input_file}: {e}") def process_files(file_list, output_dir): # Ensure the output directory exists if not os.path.exists(output_dir): os.makedirs(output_dir) # Create a single output file path output_file = os.path.join(output_dir, 'synthesized_dataset.jsonl') for input_file in file_list: process_file(input_file, output_file) # Update with your list of .gz file paths file_list = [r'C:\Users\MeMyself\FILES, r"C:\Users\MeMyself\FILES" ] # Update with your list of .gz file paths output_dir = r'C:\Users\MeMyself\reddit_question_best_answers\processed' process_files(file_list, output_dir) ``` #### **sharding script** : ```python import json import os def read_dataset(file_path): try: with open(file_path, 'r') as file: data = [json.loads(line) for line in file] print(f"Dataset loaded successfully from {file_path}.") return data except Exception as e: print(f"Error reading 
dataset from {file_path}: {e}") return [] def shard_dataset(dataset, num_shards): shard_size = len(dataset) // num_shards shards = [dataset[i:i + shard_size] for i in range(0, len(dataset), shard_size)] if len(shards) > num_shards: shards[num_shards - 1].extend(shards.pop()) print(f"Dataset sharded into {num_shards} parts.") return shards def write_shards(shards, output_dir): if not os.path.exists(output_dir): os.makedirs(output_dir) print(f"Created output directory at {output_dir}.") for i, shard in enumerate(shards): shard_file = os.path.join(output_dir, f'shard_{i+1}.jsonl') with open(shard_file, 'w') as file: for item in shard: json.dump(item, file) file.write('\n') print(f"Shard {i+1} written to {shard_file}.") def main(): input_file = 'path_to_processed_dataset.jsonl' # Update with your processed dataset file path output_dir = 'sharded_dataset' # Update with your output directory for shards num_shards = 33 dataset = read_dataset(input_file) if dataset: shards = shard_dataset(dataset, num_shards) write_shards(shards, output_dir) print("All shards have been successfully written.") else: print("No dataset to process.") if __name__ == "__main__": main() ``` ### Disclaimer : 🌟Re-format this dataset before use. 🌟Probably there's a **big problem with the token count** on these long answers 😉 🌟**Good Luck !** 🧑🏻‍🚀🚀
提供机构:
Tonic
原始信息汇总

数据集概述

基本信息

  • 许可证:MIT
  • 语言:英语
  • 标签
    • not-for-all-audiences
    • chemistry
    • biology
    • finance
    • legal
    • music
    • art
    • code
    • climate
    • medical
  • 数据集名称:Easy Reddit
  • 数据集大小:10M<n<100M

数据配置

  • 配置名称:shards
  • 数据文件
    • 分割:train
    • 路径
      • shard_1.jsonl
      • shard_2.jsonl
      • shard_3.jsonl
      • shard_4.jsonl
      • shard_5.jsonl
      • shard_6.jsonl
      • shard_7.jsonl
      • shard_8.jsonl
      • shard_9.jsonl
      • shard_10.jsonl
      • shard_11.jsonl
      • shard_12.jsonl
      • shard_13.jsonl
      • shard_14.jsonl
      • shard_15.jsonl
      • shard_16.jsonl
      • shard_17.jsonl
      • shard_18.jsonl
      • shard_19.jsonl
      • shard_20.jsonl
      • shard_21.jsonl
      • shard_22.jsonl
      • shard_23.jsonl
      • shard_24.jsonl
      • shard_25.jsonl
      • shard_26.jsonl
      • shard_27.jsonl
      • shard_28.jsonl
      • shard_29.jsonl
      • shard_30.jsonl
      • shard_31.jsonl
      • shard_32.jsonl
      • shard_33.jsonl
      • shard_34.jsonl

数据集描述

  • 数据格式: json {"prompt": "This is the first prompt", "completion": "This is the first completion"} {"prompt": "This is the second prompt", "completion": "This is the second completion"}

  • 数据集特性

    • 可分片使用或整体使用。
    • 数据集内部一致。
  • 数据集规模:54,367,153行

使用方法

  • 组合分片: python import os import random

    # Directory containing the shard JSONL files
    shard_directory = "/path/to/shard/directory"

    # Get a list of all JSONL files in the directory.
    # The suffix must be the quoted string '.jsonl'; without the quotes the
    # expression is a syntax error.
    shard_files = [f for f in os.listdir(shard_directory) if f.endswith('.jsonl')]

    # Function to read a random number of lines (between min_lines and max_lines) from a file

    def read_random_lines(filename, min_lines, max_lines):
        """Return a random selection of lines from a text file.

        A target count is drawn uniformly from ``[min_lines, max_lines]``.
        When the file holds no more lines than that target, every line is
        returned in file order; otherwise that many distinct lines are
        sampled without replacement.

        The whole file is read into memory before sampling.

        Args:
            filename: Path of the file to read.
            min_lines: Smallest number of lines to request.
            max_lines: Largest number of lines to request.

        Returns:
            A list of lines, each keeping its trailing newline.
        """
        num_lines = random.randint(min_lines, max_lines)
        # The mode has to be the string 'r'; a bare ``r`` is an undefined name.
        with open(filename, 'r') as file:
            lines = list(file)
        if len(lines) <= num_lines:
            return lines
        return random.sample(lines, num_lines)
    

    # Function to combine shards

    def combine_shards(output_filename, num_combinations):
        """Write randomly sampled lines from randomly chosen shards to one file.

        For each of ``num_combinations`` rounds a shard is picked from the
        module-level ``shard_files`` list and between 5000 and 10000 of its
        lines are appended to the output. Shards are picked independently
        each round, so the same shard (and the same line) can appear more
        than once in the result.

        Args:
            output_filename: Path of the file to create; an existing file is
                overwritten.
            num_combinations: Number of sampling rounds to perform.
        """
        # The mode has to be the string 'w'; a bare ``w`` is an undefined name.
        with open(output_filename, 'w') as output_file:
            for _ in range(num_combinations):
                selected_shard_file = random.choice(shard_files)
                lines = read_random_lines(os.path.join(shard_directory, selected_shard_file), 5000, 10000)
                output_file.writelines(lines)

    # Example usage

    combine_shards("/path/to/output/combined_shards.jsonl", 10)

预处理

  • 预处理脚本: python import json import os import gzip import logging import re import random

    # Setup basic logging

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    def clean_string(s):
        """Remove special characters, keeping only alphanumeric characters and spaces.

        Args:
            s: Either a string, or a list whose items are dictionaries (their
                ``"body"`` value is used, defaulting to an empty string) or
                other objects (converted with ``str``). List items are joined
                with single spaces before cleaning.

        Returns:
            The text with every character outside ``A-Z``, ``a-z``, ``0-9``
            and the space character removed.
        """
        if isinstance(s, list):
            # Extract text from each dictionary in the list and join into a single string
            s = " ".join([d.get("body", "") if isinstance(d, dict) else str(d) for d in s])
        # Both the pattern and the (empty) replacement must be quoted strings.
        return re.sub(r'[^A-Za-z0-9 ]+', '', s)

    def process_file(input_file, output_file):
        """Convert one gzipped JSON-lines file into prompt/completion rows.

        Each input line is parsed as JSON. Its ``"body"`` and ``"answers"``
        fields are cleaned with ``clean_string`` and stored as ``"prompt"``
        and ``"completion"``. The rows collected from the file are shuffled
        and then appended to ``output_file`` as one JSON object per line.

        Lines that are not valid JSON are logged and skipped. Any other
        exception is logged and swallowed, so one bad file does not stop a
        batch run; in that case nothing from this file is written.

        Args:
            input_file: Path of the gzip-compressed JSON-lines input.
            output_file: Path of the JSON-lines file to append to.
        """
        try:
            dataset = []
            with gzip.open(input_file, 'rt') as infile:
                for line in infile:
                    # Parse the JSON line
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        logging.error(f"Invalid JSON format in {input_file}: {line}")
                        continue

                    # Extract and clean the 'body' and 'answers' fields
                    prompt = clean_string(data.get("body", ""))
                    completion = clean_string(data.get("answers", ""))

                    # When 'body' is a list, emit one row per body and reuse
                    # the same completion for each of them.
                    if isinstance(data.get("body", ""), list):
                        for body in data.get("body", []):
                            cleaned_body = clean_string(body)
                            dataset.append({"prompt": cleaned_body, "completion": completion})
                    else:
                        dataset.append({"prompt": prompt, "completion": completion})

            # Shuffle the dataset
            random.shuffle(dataset)

            # Append (mode 'a') so that successive input files accumulate in
            # the same output file.
            with open(output_file, 'a') as outfile:
                for item in dataset:
                    json.dump(item, outfile)
                    outfile.write('\n')

            logging.info(f"Processed file: {input_file}")

        except Exception as e:
            logging.error(f"Error processing file {input_file}: {e}")

def process_files(file_list, output_dir):
    """Process every input file into a single combined output file.

    Args:
        file_list: Iterable of input file paths, each passed to
            ``process_file``.
        output_dir: Directory that receives ``synthesized_dataset.jsonl``;
            it is created if it does not exist.
    """
    # Ensure the output directory exists; exist_ok avoids a failure if the
    # directory appears between a separate existence check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    # Create a single output file path
    output_file = os.path.join(output_dir, 'synthesized_dataset.jsonl')

    for input_file in file_list:
        process_file(input_file, output_file)

# Update with your list of .gz file paths.
# Raw strings keep the Windows backslashes literal; without the r'' quoting
# a sequence such as "\r" would be read as a carriage return.
file_list = [r'C:\Users\MeMyself\FILES', r"C:\Users\MeMyself\FILES"]
# Update with your output directory
output_dir = r'C:\Users\MeMyself\reddit_question_best_answers\processed'
process_files(file_list, output_dir)

分片脚本

  • 分片脚本: python import json import os

    def read_dataset(file_path):
        """Load a JSON-lines file into a list of parsed objects.

        Blank lines are skipped. Any failure (missing file, invalid JSON, ...)
        is reported with ``print`` and results in an empty list rather than
        an exception.

        Args:
            file_path: Path of the JSON-lines file to read.

        Returns:
            A list with one parsed object per non-blank line, or ``[]`` on
            error.
        """
        try:
            # The mode has to be the string 'r'; a bare ``r`` is an undefined name.
            with open(file_path, 'r') as file:
                # Ignore empty lines so a stray trailing newline does not
                # invalidate the whole file.
                data = [json.loads(line) for line in file if line.strip()]
            print(f"Dataset loaded successfully from {file_path}.")
            return data
        except Exception as e:
            print(f"Error reading dataset from {file_path}: {e}")
            return []

    def shard_dataset(dataset, num_shards):
        """Split a list into at most ``num_shards`` consecutive shards.

        Every shard except the last holds ``len(dataset) // num_shards``
        items; the last shard also takes whatever remains. When the dataset
        has fewer items than ``num_shards`` each item becomes its own shard,
        and an empty dataset yields no shards.

        Args:
            dataset: List of items to split; it is not modified.
            num_shards: Desired number of shards; must be at least 1.

        Returns:
            A list of lists that together contain every item of ``dataset``
            in the original order.

        Raises:
            ValueError: If ``num_shards`` is less than 1.
        """
        if num_shards < 1:
            raise ValueError("num_shards must be at least 1")

        shard_size = len(dataset) // num_shards
        if shard_size == 0:
            # Fewer items than shards: a zero step would make range() raise,
            # so fall back to one item per shard.
            shards = [[item] for item in dataset]
        else:
            # Cut num_shards - 1 equal pieces and let the final shard absorb
            # the entire remainder. Merging only one trailing chunk is not
            # enough when the remainder is as large as shard_size or larger.
            cut = shard_size * (num_shards - 1)
            shards = [dataset[i:i + shard_size] for i in range(0, cut, shard_size)]
            shards.append(dataset[cut:])

        # Report the real shard count, which can be below num_shards.
        print(f"Dataset sharded into {len(shards)} parts.")
        return shards

    def write_shards(shards, output_dir):
        """Write each shard to its own JSON-lines file.

        Files are named ``shard_1.jsonl``, ``shard_2.jsonl``, ... in the
        order the shards are given, and existing files with those names are
        overwritten.

        Args:
            shards: Iterable of shards, each an iterable of JSON-serializable
                items.
            output_dir: Directory to write into; created if missing.
        """
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(f"Created output directory at {output_dir}.")

        for i, shard in enumerate(shards):
            # Shard file names are 1-based.
            shard_file = os.path.join(output_dir, f'shard_{i+1}.jsonl')
            with open(shard_file, 'w') as file:
                for item in shard:
                    json.dump(item, file)
                    file.write('\n')
            print(f"Shard {i+1} written to {shard_file}.")

def main():
    """Read the processed dataset, split it into shards and write them out.

    The input path, output directory and shard count are hard-coded below
    and are meant to be edited before running. If the dataset cannot be
    read or is empty, nothing is written.
    """
    input_file = 'path_to_processed_dataset.jsonl'  # Update with your processed dataset file path
    output_dir = 'sharded_dataset'  # Update with your output directory for shards
    num_shards = 33

    dataset = read_dataset(input_file)
    if dataset:
        shards = shard_dataset(dataset, num_shards)
        write_shards(shards, output_dir)
        print("All shards have been successfully written.")
    else:
        print("No dataset to process.")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

免责声明

  • 使用前请重新格式化数据集。
  • 可能存在长答案的token计数问题。
  • 祝好运!
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作