mylesgoose/alpaca-cleaned-gpt4-turbo
数据集概述
基本信息
- 许可证: cc-by-4.0
- 语言: 英语
- 标签: 化学、生物学、金融、艺术、代码、webdataset、instruction-finetuning、gpt4-turbo
- 数据量: 10K<n<100K
数据格式
- 格式: SQLite 和 JSON
- 内容: JSON 格式仅包含新的 GPT 输出。
数据内容
- 数据行数: 51760 行
- 示例数据:
- Instruction: 给定一段文本,你需要输出文本中的陈述是观点还是事实。观点被定义为无法证明真假的陈述,通常基于某人的信仰。事实被定义为可以证明真假且不基于某人信仰的陈述。
- Input: 文本: 今天天空非常多云。
- Original Output: 文本中的陈述是事实。
- GPT Output: 事实
数据处理
- 数据来源: 从 Alpaca 下载并处理,移除重复的哈希及其对应的输入和输出列。
- 处理步骤:
- 使用 GPT-4 Turbo API 生成响应。
- 将数据存储在 SQLite 数据库中,包含原始列和新的 GPT 输出。
- 将 SQLite 数据库转换回 JSON 格式。
代码示例
-
创建数据库 (Python):
import json
import hashlib
import sqlite3
def generate_hash(text):
    """Return the hexadecimal SHA-256 digest of ``text``.

    The string is converted to bytes with ``str.encode()`` (default codec,
    UTF-8) before it is hashed.
    """
    digest = hashlib.sha256()
    digest.update(text.encode())
    return digest.hexdigest()
# Build the SQLite database from the cleaned Alpaca JSON file.
#
# Every JSON entry becomes one row of the ``entries`` table: its 1-based
# position, the instruction/input/output texts, an empty ``gpt_output``
# placeholder, and a SHA-256 hash of the three texts concatenated.
ALPACA_JSON_PATH = "/home/myles1/alpaca-cleaned/original/alpaca-cleaned/alpaca_data_cleaned.json"
ALPACA_DB_PATH = "/home/myles1/alpaca-cleaned/original/alpaca-cleaned/database.db"

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(ALPACA_JSON_PATH, "r", encoding="utf-8") as file:
    data = json.load(file)

conn = sqlite3.connect(ALPACA_DB_PATH)
try:
    cursor = conn.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS entries ("
        "line_number INTEGER PRIMARY KEY, "
        "instruction TEXT, "
        "input TEXT, "
        "original_output TEXT, "
        "gpt_output TEXT, "
        "hash TEXT)"
    )
    # NOTE: line_number is the primary key and the table is only created when
    # missing, so running this against an already-populated database raises
    # sqlite3.IntegrityError instead of duplicating rows.
    cursor.executemany(
        "INSERT INTO entries "
        "(line_number, instruction, input, original_output, gpt_output, hash) "
        "VALUES (?, ?, ?, ?, '', ?)",
        (
            (
                idx,
                entry["instruction"],
                entry["input"],
                entry["output"],
                generate_hash(entry["instruction"] + entry["input"] + entry["output"]),
            )
            for idx, entry in enumerate(data, start=1)
        ),
    )
    conn.commit()
finally:
    # Release the database file even if the load fails part-way.
    conn.close()
-
查询 GPT-4 Turbo API (Python):
import sqlite3
import json
import hashlib
import concurrent.futures
from openai import OpenAI
def generate_hash(text):
    """Hash ``text`` with SHA-256 and return the digest as a hex string."""
    encoded = text.encode()
    hasher = hashlib.sha256(encoded)
    return hasher.hexdigest()
# SHA-256 of the empty string. A row whose gpt_output_hash equals this value
# has an empty gpt_output and therefore still needs a GPT-4 Turbo response.
EMPTY_SHA256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"

# Line number of the row most recently handled by process_row.
# NOTE(review): written from worker threads without a lock and never read in
# the visible code; treat it as a debugging aid only.
last_processed_row = 0


def process_row(row):
    """Fill in the GPT output for one ``entries`` row if it is still empty.

    Args:
        row: A 9-item sequence in the order ``(line_number, instruction,
            input, original_output, gpt_output, instruction_hash, input_hash,
            output_hash, gpt_output_hash)``.
            NOTE(review): this is wider than the 6-column table built by the
            database-creation script; confirm the schema of the database
            this is run against.

    Returns:
        None. When ``gpt_output_hash`` is the hash of the empty string, the
        row's ``gpt_output`` and ``gpt_output_hash`` columns are updated in
        the database; otherwise the row is left untouched.
    """
    global last_processed_row

    (
        line_number,
        instruction,
        input_text,
        _original_output,
        _gpt_output,
        _instruction_hash,
        _input_hash,
        _output_hash,
        gpt_output_hash,
    ) = row

    print(f"Processing row {line_number}...")
    if gpt_output_hash == EMPTY_SHA256:
        print(f"Updating row {line_number} with new GPT output...")

        # Call the API before touching the database so that no connection is
        # held open for the duration of the network round-trip.
        client = OpenAI()
        message = f"{instruction}\n{input_text}"
        completion = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[{"role": "system", "content": message}],
        )
        gpt_output = completion.choices[0].message
        print(f"New GPT output: {gpt_output}")

        # Each call opens its own connection because this function runs in
        # worker threads; close it even when the UPDATE fails.
        conn = sqlite3.connect("/home/myles1/alpaca-cleaned/data/database.db")
        try:
            conn.execute(
                "UPDATE entries SET gpt_output=?, gpt_output_hash=? WHERE line_number=?",
                (gpt_output.content, generate_hash(gpt_output.content), line_number),
            )
            conn.commit()
        finally:
            conn.close()
        # Report success only after the change has been committed.
        print(f"Updated row {line_number} successfully!")

    last_processed_row = line_number
# Load every row up front, then hand the rows to a pool of worker threads.
# The reader connection is closed before the workers start so that it is not
# kept open while they write to the same database file.
conn = sqlite3.connect("/home/myles1/alpaca-cleaned/data/database.db")
try:
    rows = conn.execute("SELECT * FROM entries").fetchall()
finally:
    conn.close()

with concurrent.futures.ThreadPoolExecutor(max_workers=70) as executor:
    futures = [executor.submit(process_row, row) for row in rows]
    for future in concurrent.futures.as_completed(futures):
        try:
            # result() re-raises whatever exception the worker raised.
            future.result()
        except Exception as e:
            # Best effort: report the failure and keep processing other rows.
            print(f"An error occurred: {e}")

print("Database updated successfully!")
-
转换回 JSON 格式 (Python):
import sqlite3
import json
# Export the instruction/input/GPT-output columns of the database back to an
# Alpaca-style JSON file, with the GPT output stored under the "output" key.
EXPORT_DB_PATH = "/home/myles1/alpaca-cleaned/data/database copy 5.db"
EXPORT_JSON_PATH = "/home/myles1/alpaca-cleaned/output/alpaca_data_extracted.json"

conn = sqlite3.connect(EXPORT_DB_PATH)
try:
    entries = conn.execute(
        "SELECT instruction, input, gpt_output FROM entries"
    ).fetchall()
finally:
    # Release the database file even if the query fails.
    conn.close()

output_data = [
    {"instruction": instruction, "input": input_text, "output": gpt_output}
    for instruction, input_text, gpt_output in entries
]

with open(EXPORT_JSON_PATH, "w", encoding="utf-8") as file:
    json.dump(output_data, file, indent=4)

# Report the file that was actually written (the old message said output.json).
print(f"JSON data extracted successfully to {EXPORT_JSON_PATH}.")




