Tonic/EasyReddit
数据集概述
基本信息
- 许可证:MIT
- 语言:英语
- 标签:
- not-for-all-audiences
- chemistry
- biology
- finance
- legal
- music
- art
- code
- climate
- medical
- 数据集名称:Easy Reddit
- 数据集大小:10M<n<100M
数据配置
- 配置名称:shards
- 数据文件:
- 分割:train
- 路径:
- shard_1.jsonl
- shard_2.jsonl
- shard_3.jsonl
- shard_4.jsonl
- shard_5.jsonl
- shard_6.jsonl
- shard_7.jsonl
- shard_8.jsonl
- shard_9.jsonl
- shard_10.jsonl
- shard_11.jsonl
- shard_12.jsonl
- shard_13.jsonl
- shard_14.jsonl
- shard_15.jsonl
- shard_16.jsonl
- shard_17.jsonl
- shard_18.jsonl
- shard_19.jsonl
- shard_20.jsonl
- shard_21.jsonl
- shard_22.jsonl
- shard_23.jsonl
- shard_24.jsonl
- shard_25.jsonl
- shard_26.jsonl
- shard_27.jsonl
- shard_28.jsonl
- shard_29.jsonl
- shard_30.jsonl
- shard_31.jsonl
- shard_32.jsonl
- shard_33.jsonl
- shard_34.jsonl
数据集描述
-
数据格式:JSON Lines(每行一个 JSON 对象),例如:
{"prompt": "This is the first prompt", "completion": "This is the first completion"}
{"prompt": "This is the second prompt", "completion": "This is the second completion"}
-
数据集特性:
- 可分片使用或整体使用。
- 数据集内部一致。
-
数据集规模:54,367,153行
使用方法
-
组合分片: python import os import random
# Directory containing the shard JSONL files (placeholder; update before running)
shard_directory = "/path/to/shard/directory"
# Names (not full paths) of all JSONL files found directly in that directory
shard_files = [f for f in os.listdir(shard_directory) if f.endswith(".jsonl")]
# Function to read a random number of lines (between min_lines and max_lines) from a file
def read_random_lines(filename, min_lines, max_lines):
    """Return a random sample of lines from a text file.

    A target count is drawn uniformly from ``[min_lines, max_lines]``
    (inclusive). If the file has no more lines than that count, every line
    is returned in file order; otherwise that many distinct lines are
    sampled without replacement.

    Args:
        filename: Path of the file to read.
        min_lines: Lower bound (inclusive) for the number of lines.
        max_lines: Upper bound (inclusive) for the number of lines.

    Returns:
        A list of lines, each keeping its trailing newline.
    """
    num_lines = random.randint(min_lines, max_lines)

    # The whole file is loaded into memory so it can be sampled from.
    with open(filename, "r", encoding="utf-8") as file:
        lines = list(file)

    if len(lines) <= num_lines:
        return lines
    return random.sample(lines, num_lines)


# Function to combine shards
def combine_shards(output_filename, num_combinations):
    """Write a combined JSONL file built from random samples of random shards.

    For each of ``num_combinations`` rounds, one file is picked at random
    from the module-level ``shard_files`` list (inside ``shard_directory``)
    and between 5000 and 10000 of its lines are appended to the output.
    The same shard may be picked in more than one round, so the output can
    contain duplicate lines.

    Args:
        output_filename: Path of the combined file; overwritten if it exists.
        num_combinations: Number of sampling rounds to perform.
    """
    with open(output_filename, "w") as output_file:
        for _ in range(num_combinations):
            selected_shard_file = random.choice(shard_files)
            shard_path = os.path.join(shard_directory, selected_shard_file)
            lines = read_random_lines(shard_path, 5000, 10000)
            output_file.writelines(lines)
# Example usage: run 10 sampling rounds and write them to a single combined file
combine_shards("/path/to/output/combined_shards.jsonl", 10)
预处理
-
预处理脚本: python import json import os import gzip import logging import re import random
# Setup basic logging: INFO level with timestamp, level name and message
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
def clean_string(s):
    """Remove special characters, keeping only ASCII letters, digits and spaces.

    Args:
        s: A string, or a list whose items are dicts (their ``"body"`` value
            is used) or other values (converted with ``str``). List items
            are joined with single spaces before cleaning. ``None`` is
            treated as an empty string and any other non-string value is
            converted with ``str``.

    Returns:
        The cleaned string.
    """
    if s is None:
        return ""
    if isinstance(s, list):
        # Extract text from each dictionary in the list and join into a single string
        parts = []
        for d in s:
            if isinstance(d, dict):
                text = d.get("body", "")
                # A missing/None body must not break the join below.
                parts.append("" if text is None else str(text))
            else:
                parts.append(str(d))
        s = " ".join(parts)
    elif not isinstance(s, str):
        # re.sub only accepts str; coerce other scalar values.
        s = str(s)
    return re.sub(r"[^A-Za-z0-9 ]+", "", s)
def process_file(input_file, output_file):
    """Convert one gzipped JSON Lines file into prompt/completion rows.

    Each input line is parsed as JSON. Its ``"body"`` and ``"answers"``
    fields are cleaned with ``clean_string``; when ``"body"`` is a list, one
    row is produced per body item, all sharing the same completion. The rows
    from this file are shuffled and then appended to ``output_file`` as JSON
    Lines. Lines that are not valid JSON objects are logged and skipped.
    Any other error is logged and the file is abandoned without raising.

    Args:
        input_file: Path of the gzip-compressed JSON Lines input.
        output_file: Path of the JSON Lines file to append to.
    """
    try:
        dataset = []
        with gzip.open(input_file, "rt", encoding="utf-8") as infile:
            for line in infile:
                # Parse the JSON line
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    logging.error(f"Invalid JSON format in {input_file}: {line}")
                    continue
                if not isinstance(data, dict):
                    # A bare list/number/string has no fields to extract;
                    # skip it instead of aborting the whole file.
                    logging.error(f"Invalid JSON format in {input_file}: {line}")
                    continue

                # Extract and clean the body and answers fields
                raw_body = data.get("body", "")
                completion = clean_string(data.get("answers", ""))

                # For each body found, make a new row and duplicate the completion for it
                if isinstance(raw_body, list):
                    for body in raw_body:
                        cleaned_body = clean_string(body)
                        dataset.append({"prompt": cleaned_body, "completion": completion})
                else:
                    dataset.append({"prompt": clean_string(raw_body), "completion": completion})

        # Shuffle the dataset
        random.shuffle(dataset)

        # Write the shuffled dataset to the output file
        with open(output_file, "a", encoding="utf-8") as outfile:
            for item in dataset:
                json.dump(item, outfile)
                outfile.write("\n")

        logging.info(f"Processed file: {input_file}")
    except Exception as e:
        # Best-effort batch job: report the failure and let the caller
        # continue with the next file.
        logging.error(f"Error processing file {input_file}: {e}")
def process_files(file_list, output_dir):
    """Process every input file into one combined JSON Lines file.

    Args:
        file_list: Iterable of input file paths passed to ``process_file``.
        output_dir: Directory that receives ``synthesized_dataset.jsonl``;
            created if it does not exist.
    """
    # Ensure the output directory exists (no error if it already does)
    os.makedirs(output_dir, exist_ok=True)

    # Create a single output file path
    output_file = os.path.join(output_dir, "synthesized_dataset.jsonl")

    for input_file in file_list:
        process_file(input_file, output_file)
# Update with your list of .gz file paths
# NOTE(review): these are placeholder paths; the backslash separators were
# lost in extraction and are reconstructed here - replace with real paths.
file_list = [r"C:\Users\Me\Myself\FILES", r"C:\Users\Me\Myself\FILES"]  # Update with your list of .gz file paths
output_dir = r"C:\Users\Me\Myself\reddit_question_best_answers\processed"
process_files(file_list, output_dir)
分片脚本
-
分片脚本: python import json import os
def read_dataset(file_path):
    """Load a JSON Lines file into a list of parsed objects.

    Blank lines are ignored. On any read or parse failure an error is
    printed and an empty list is returned instead of raising.

    Args:
        file_path: Path of the JSON Lines file.

    Returns:
        A list with one parsed JSON value per non-blank line, or ``[]`` on
        error.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            # Skip blank lines so a trailing newline does not discard the
            # whole dataset with a JSONDecodeError.
            data = [json.loads(line) for line in file if line.strip()]
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading dataset from {file_path}: {e}")
        return []
    print(f"Dataset loaded successfully from {file_path}.")
    return data
def shard_dataset(dataset, num_shards):
    """Split a dataset into at most ``num_shards`` contiguous shards.

    All shards but the last hold ``len(dataset) // shard_count`` items; the
    last shard also receives the remainder. When the dataset has fewer items
    than ``num_shards``, one single-item shard is produced per item, so no
    empty shards are returned. An empty dataset yields an empty list.

    Args:
        dataset: Sequence of items to split (supports ``len`` and slicing).
        num_shards: Requested number of shards; must be positive.

    Returns:
        A list of shards, in the original item order.

    Raises:
        ValueError: If ``num_shards`` is not positive.
    """
    if num_shards <= 0:
        raise ValueError("num_shards must be a positive integer")

    # Never create more shards than there are items; this also keeps the
    # shard size from dropping to zero.
    shard_count = min(num_shards, len(dataset))
    if shard_count == 0:
        shards = []
    else:
        shard_size = len(dataset) // shard_count
        shards = [
            dataset[i * shard_size:(i + 1) * shard_size]
            for i in range(shard_count - 1)
        ]
        # The last shard absorbs whatever is left over.
        shards.append(dataset[(shard_count - 1) * shard_size:])

    print(f"Dataset sharded into {len(shards)} parts.")
    return shards
def write_shards(shards, output_dir):
    """Write each shard to ``shard_<n>.jsonl`` inside ``output_dir``.

    Files are numbered from 1 in shard order and contain one JSON document
    per line. Existing files with the same names are overwritten.

    Args:
        shards: Iterable of shards, each an iterable of JSON-serializable
            items.
        output_dir: Destination directory; created if it does not exist.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory at {output_dir}.")

    for number, shard in enumerate(shards, start=1):
        shard_file = os.path.join(output_dir, f"shard_{number}.jsonl")
        with open(shard_file, "w") as file:
            for item in shard:
                json.dump(item, file)
                file.write("\n")
        print(f"Shard {number} written to {shard_file}.")
def main():
    """Load the processed dataset, split it into shards and write them out.

    The input path, output directory and shard count are placeholders set
    below; edit them before running.
    """
    input_file = "path_to_processed_dataset.jsonl"  # Update with your processed dataset file path
    output_dir = "sharded_dataset"  # Update with your output directory for shards
    num_shards = 33

    dataset = read_dataset(input_file)
    if dataset:
        shards = shard_dataset(dataset, num_shards)
        write_shards(shards, output_dir)
        print("All shards have been successfully written.")
    else:
        # read_dataset returns [] both for an empty file and on error.
        print("No dataset to process.")
# Run the sharding script only when executed directly, not when imported.
if __name__ == "__main__":
    main()
免责声明
- 使用前请重新格式化数据集。
- 可能存在长答案的token计数问题。
- 祝好运!



